Skip to content

Commit

Permalink
Merge remote-tracking branch 'master/main' into nanobind
Browse files Browse the repository at this point in the history
Testing pending.
  • Loading branch information
iglesias committed Jun 4, 2024
2 parents 814bf78 + 11df3cc commit a73d276
Show file tree
Hide file tree
Showing 45 changed files with 772 additions and 554 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 3.24)
project (Tapkee LANGUAGES CXX)

# set paths
set (CMAKE_CXX_STANDARD 20)
set (CMAKE_CXX_STANDARD 23)
set (TAPKEE_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
set (TAPKEE_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
set (TAPKEE_TESTS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/test/unit")
Expand Down
34 changes: 14 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,9 @@ some examples of usage Tapkee in Shogun as
API
---

We provide an interface based on the method chaining technique. The chain starts from the call
of the `initialize()` method and followed with the `withParameters(const ParametersSet&)` call
which is used to provide parameters like the method to use and its settings. The provided
argument is formed with the following syntax:
We provide an interface based on the method chaining technique. The chain starts with the call
of the `with(const ParametersSet&)` method, which is used to provide parameters like the method
to use and its settings. The provided argument is formed with the following syntax:

(keyword1=value1, keyword2=value2)

Expand All @@ -63,24 +62,23 @@ are defined: `method`, `eigen_method`, `neighbors_method`, `num_neighbors`, `tar
As an example of parameters setting, if you want to use the Isomap
algorithm with the number of neighbors set to 15:

tapkee::initialize().withParameters((method=Isomap,num_neighbors=15))
tapkee::with((method=Isomap,num_neighbors=15))

Please note that the inner parentheses are necessary: the parameters are combined
with the comma operator, which would otherwise be parsed as separating function arguments.

Next, with initialized parameters you may either embed the provided matrix with:
Next, you may either embed the provided matrix with:

tapkee::initialize().withParameters((method=Isomap,num_neighbors=15)).
.embedUsing(matrix);
tapkee::with((method=Isomap,num_neighbors=15)).embedUsing(matrix);

Or provide callbacks (kernel, distance and features) using any combination
of the `withKernel(KernelCallback)`, `withDistance(DistanceCallback)` and
`withFeatures(FeaturesCallback)` member functions:

tapkee::initialize().withParameters((method=Isomap,num_neighbors=15))
.withKernel(kernel_callback)
.withDistance(distance_callback)
.withFeatures(features_callback)
tapkee::with((method=Isomap,num_neighbors=15))
.withKernel(kernel_callback)
.withDistance(distance_callback)
.withFeatures(features_callback)

Once callbacks are initialized you may either embed data using an
STL-compatible sequence of indices or objects (that supports the
Expand All @@ -92,17 +90,14 @@ member function.

To summarize, here are a few examples:

TapkeeOutput output = initialize()
.withParameters((method=Isomap,num_neighbors=15))
TapkeeOutput output = with((method=Isomap,num_neighbors=15))
.embedUsing(matrix);

TapkeeOutput output = initialize()
.withParameters((method=Isomap,num_neighbors=15))
TapkeeOutput output = with((method=Isomap,num_neighbors=15))
.withDistance(distance_callback)
.embedUsing(indices);

TapkeeOutput output = initialize()
.withParameters((method=Isomap,num_neighbors=15))
TapkeeOutput output = with((method=Isomap,num_neighbors=15))
.withDistance(distance_callback)
.embedRange(indices.begin(),indices.end());

Expand Down Expand Up @@ -130,8 +125,7 @@ A minimal working example of a program that uses the library is:

MyDistanceCallback d;

TapkeeOutput output = tapkee::initialize()
.withParameters((method=MultidimensionalScaling,target_dimension=1))
TapkeeOutput output = tapkee::with((method=MultidimensionalScaling,target_dimension=1))
.withDistance(d)
.embedUsing(indices);

Expand Down
51 changes: 30 additions & 21 deletions examples/cbcl/cbcl.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,39 @@
import numpy, datetime, json, subprocess, sys, os, glob
import numpy
import datetime
import json
import subprocess
import sys
import os
import glob
import tempfile

import scipy.misc
from PIL import Image

def load(dir):
    """Load every ``*.pgm`` image found directly inside ``dir``.

    Returns a tuple ``(features, images)``: ``features`` stacks one flattened
    image per row, and ``images`` is a list of ``(path, pixel_array)`` pairs
    in the same order as the rows.

    Raises ``ValueError`` when the directory contains no ``.pgm`` files.
    """
    images = []
    vecs = []
    # Sorted so the row order is reproducible; glob itself gives no ordering guarantee.
    for f in sorted(glob.glob(os.path.join(dir, '*.pgm'))):
        # The context manager releases the file handle once the pixels are copied.
        with Image.open(f) as handle:
            image = numpy.array(handle)
        images.append((f, image))
        vecs.append(image.ravel())
    if not vecs:
        # numpy.vstack would fail here anyway, but with a message that hides the cause.
        raise ValueError('No .pgm images found in %r' % (dir,))
    return numpy.vstack(vecs), images

def embed(feature_matrix):
    """Embed ``feature_matrix`` by running the ``./bin/tapkee`` executable.

    The matrix is written as comma-separated text to a temporary file, the
    binary is invoked with ``-m ltsa -k 80``, and the comma-separated output
    file it produces is loaded and returned as a numpy array.

    Raises ``RuntimeError`` (message ``'Failed to embed'``) when the binary
    cannot be started or exits with a non-zero status.
    """
    # Both temporary files are removed as soon as the block is left.
    with tempfile.NamedTemporaryFile(prefix='cbcl_input') as input_file, \
            tempfile.NamedTemporaryFile(prefix='cbcl_output') as output_file:
        numpy.savetxt(input_file.name, feature_matrix, delimiter=',')
        # An argument list avoids going through the shell entirely.
        command = [
            './bin/tapkee',
            '-i', input_file.name,
            '-o', output_file.name,
            '-m', 'ltsa',
            '-k', '80',
            '--transpose-output',
            '--verbose',
            '--benchmark',
        ]
        try:
            process = subprocess.run(command, capture_output=True, text=True)
        except OSError as err:
            # Missing or non-executable binary: report it like any other failure.
            raise RuntimeError('Failed to embed') from err
        print(process.stderr)
        if process.returncode != 0:
            raise RuntimeError('Failed to embed')
        return numpy.loadtxt(output_file.name, delimiter=',')

def export_json(outfile, embedding, images):
    """Write the embedding coordinates and image file names to ``outfile`` as JSON.

    ``embedding`` is indexed as ``embedding[row, i]``: row 0 supplies ``cx``
    and row 1 supplies ``cy`` for point ``i``. ``images[i][0]`` supplies the
    matching ``fname``. The file contains a single object with a ``data``
    list holding one entry per embedded point.
    """
    N = embedding.shape[1]
    print('N', N)
    json_dict = {
        'data': [
            {
                # float() turns numpy scalars into plain JSON-serialisable numbers.
                'cx': float(embedding[0, i]),
                'cy': float(embedding[1, i]),
                'fname': images[i][0],
            }
            for i in range(N)
        ]
    }
    # The context manager guarantees the file is flushed and closed.
    with open(outfile, 'w') as stream:
        json.dump(json_dict, stream)

Expand All @@ -34,21 +43,21 @@ def plot_embedding(embedding,images):
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(embedding[0],embedding[1],alpha=0.0)
for i in xrange(embedding.shape[1]):
img = numpy.zeros((images[i][1].shape[0],images[i][1].shape[1],4))
img[:,:,0] = 255*images[i][1]
img[:,:,1] = 255*images[i][1]
img[:,:,2] = 255*images[i][1]
img[:,:,3] = 1
img[(images[i][1]==28),3] = 0
for i in range(embedding.shape[1]):
img = numpy.zeros((images[i][1].shape[0], images[i][1].shape[1], 4))
img[:,:,0] = images[i][1]/255.0
img[:,:,1] = images[i][1]/255.0
img[:,:,2] = images[i][1]/255.0
img[:,:,3] = 1.0
img[(images[i][1]==28), 3] = 0
imagebox = OffsetImage(img,cmap=plt.cm.gray,zoom=0.2)
ab = AnnotationBbox(imagebox, (embedding[0][i], embedding[1,i]),pad=0.001,frameon=False)
ab = AnnotationBbox(imagebox, (embedding[0][i], embedding[1,i]), pad=0.001, frameon=False)
ax.add_artist(ab)
plt.show()

if __name__ == "__main__":
    # Load the images, embed them, optionally export the result, then plot it.
    features, faces = load('data/cbcl')
    projection = embed(features)
    cli_args = sys.argv
    # The JSON export only happens when exactly two command-line arguments are
    # given; the second one is used as the output path.
    if len(cli_args) == 3:
        export_json(cli_args[2], projection, faces)
    plot_embedding(projection, faces)
42 changes: 25 additions & 17 deletions examples/go.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import sys
import os
import subprocess
import re
import tempfile

import numpy as np
from utils import generate_data, plot
Expand All @@ -20,34 +22,40 @@
}

def embed(data,method):
    """Embed ``data`` with the ``bin/tapkee`` executable using ``method``.

    ``data.T`` is written as comma-separated text to a temporary file and the
    binary is run on it. Its stderr is printed and then searched for the
    ``Parameter dimension reduction method = [...]`` line.

    Returns ``(embedded_data, used_method)``. ``embedded_data`` is the array
    loaded from the binary's output file. ``used_method`` is the method name
    captured from stderr, or ``''`` when no such line is present.

    Raises ``RuntimeError`` (message ``'Failed to embed'``) when the binary
    cannot be started or exits with a non-zero status.
    """
    import shlex  # local import: only used to render the reproducible command line

    tapkee_binary = 'bin/tapkee'
    # Both temporary files are removed as soon as the block is left.
    with tempfile.NamedTemporaryFile(prefix='tapkee_input') as input_file, \
            tempfile.NamedTemporaryFile(prefix='tapkee_output') as output_file:
        np.savetxt(input_file.name, data.T, delimiter=',')
        # An argument list keeps the user-supplied method name away from the shell.
        command = [
            tapkee_binary,
            '-i', input_file.name,
            '-o', output_file.name,
            '-m', method,
            '-k', '20',
            '--precompute',
            '--debug',
            '--verbose',
            '--transpose-output',
            '--benchmark',
        ]
        print('-- To reproduce this use the following command `{}`'.format(shlex.join(command)))
        try:
            process = subprocess.run(command, capture_output=True, text=True)
        except OSError as err:
            # Missing or non-executable binary: report it like any other failure.
            raise RuntimeError('Failed to embed') from err
        print(process.stderr)
        if process.returncode != 0:
            raise RuntimeError('Failed to embed')

        if match := re.search(r'Parameter dimension reduction method = \[([a-zA-Z0-9() ]+)\]', process.stderr):
            used_method = match.group(1)
        else:
            used_method = ''

        # Load by file name, consistently with how the input file was written.
        embedded_data = np.loadtxt(output_file.name, delimiter=',')
    return embedded_data, used_method

if __name__ == "__main__":
    # Command-line entry point: generate a dataset, embed it, plot both views.
    import argparse
    parser = argparse.ArgumentParser(description='Graphical example of dimension reduction with Tapkee.')
    # The list mirrors the dataset names handled by generate_data, including 'klein'.
    parser.add_argument('dataset', type=str, nargs=1, help='A dataset to embed. One of the following: %s' % str(['swissroll', 'scurve', 'helix', 'twinpeaks', 'klein']))
    parser.add_argument('method', type=str, nargs=1, help='A method to use. Any of the methods supported by Tapkee')
    args = parser.parse_args()

    # nargs=1 yields single-element lists, hence the [0].
    dataset = args.dataset[0]
    method = args.method[0]
    print('-- Loading %s data' % dataset)
    data, colors = generate_data(dataset)
    print('-- Embedding %s data with %s' % (dataset, method))
    embedded_data, used_method = embed(data, method)
    print('-- Plotting embedded data')
    # The plot title uses the method name reported by the binary, not the raw argument.
    plot(data, embedded_data, colors, used_method)
3 changes: 1 addition & 2 deletions examples/minimal/minimal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ int main(int argc, const char **argv)

MyDistanceCallback distance;

TapkeeOutput output = initialize()
.withParameters((method = MultidimensionalScaling, target_dimension = 1))
TapkeeOutput output = with((method = MultidimensionalScaling, target_dimension = 1))
.withDistance(distance)
.embedUsing(indices);

Expand Down
5 changes: 2 additions & 3 deletions examples/precomputed/precomputed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ using namespace tapkee;
int main(int argc, const char **argv)
{
const int N = 100;
tapkee::DenseMatrix distances(N, N);
DenseMatrix distances(N, N);
vector<IndexType> indices(N);
for (int i = 0; i < N; i++)
{
Expand All @@ -19,8 +19,7 @@ int main(int argc, const char **argv)

precomputed_distance_callback distance(distances);

TapkeeOutput output = initialize()
.withParameters((method = MultidimensionalScaling, target_dimension = 1))
TapkeeOutput output = with((method = MultidimensionalScaling, target_dimension = 1))
.withDistance(distance)
.embedUsing(indices);

Expand Down
3 changes: 1 addition & 2 deletions examples/rna/rna.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ int main(int argc, const char **argv)

MatchKernelCallback kernel;

TapkeeOutput result = initialize()
.withParameters((method = KernelLocallyLinearEmbedding, num_neighbors = 30))
TapkeeOutput result = with((method = KernelLocallyLinearEmbedding, num_neighbors = 30))
.withKernel(kernel)
.embedUsing(rnas);

Expand Down
81 changes: 65 additions & 16 deletions examples/utils.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,91 @@
import matplotlib.pyplot as plt
import numpy as np

def _swissroll(rng, N):
    """Sample N points of the 'swissroll' dataset; returns (X, tt)."""
    tt = (3 * np.pi / 2) * (1 + 2 * rng.rand(N))
    height = rng.rand(N) - 0.5
    X = np.array([tt * np.cos(tt), 10 * height, tt * np.sin(tt)])
    return X, tt


def _scurve(rng, N):
    """Sample N points of the 'scurve' dataset; returns (X, tt)."""
    tt = 3 * np.pi * (rng.rand(N) - 0.5)
    height = rng.rand(N) - 0.5
    X = np.array([np.sin(tt), 10 * height, np.sign(tt) * (np.cos(tt) - 1)])
    return X, tt


def _helix(rng, N):
    """Build N evenly spaced points of the 'helix' dataset (no randomness used)."""
    tt = np.linspace(1, N, N) / N * 2 * np.pi
    radius = 2 + np.cos(8 * tt)
    X = np.vstack([radius * np.cos(tt), radius * np.sin(tt), np.sin(8 * tt)])
    return X, tt


def _twinpeaks(rng, N):
    """Sample N points of the 'twinpeaks' dataset; the noisy height doubles as tt."""
    plane = rng.uniform(-1, 1, size=(N, 2))
    tt = np.sin(np.pi * plane[:, 0]) * np.tanh(plane[:, 1])
    tt += 0.1 * rng.normal(size=tt.shape)
    return np.vstack([plane.T, tt]), tt


def _klein(rng, N):
    """Sample N noisy points of the 'klein' dataset; returns (X, u)."""
    u = rng.uniform(0, 2 * np.pi, N)
    v = rng.uniform(0, 2 * np.pi, N)
    radial = 2 + np.cos(u / 2) * np.sin(v) - np.sin(u / 2) * np.sin(2 * v)
    x = radial * np.cos(u)
    y = radial * np.sin(u)
    z = np.sin(u / 2) * np.sin(v) + np.cos(u / 2) * np.sin(2 * v)

    noise = 0.01
    # Noise is drawn for x, then y, then z so results stay reproducible per seed.
    x += noise * rng.normal(size=x.shape)
    y += noise * rng.normal(size=y.shape)
    z += noise * rng.normal(size=z.shape)
    return np.vstack((x, y, z)), u


_DATASET_GENERATORS = {
    'swissroll': _swissroll,
    'scurve': _scurve,
    'helix': _helix,
    'twinpeaks': _twinpeaks,
    'klein': _klein,
}


def generate_data(type, N=1000, random_state=None):
    """Generate one of the synthetic 3-D datasets.

    Parameters:
        type: dataset name, one of 'swissroll', 'scurve', 'helix',
            'twinpeaks' or 'klein'.
        N: number of points to generate.
        random_state: seed passed to ``numpy.random.RandomState``. Calls with
            the same seed return the same data.

    Returns a tuple ``(X, tt)`` where ``X`` has shape ``(3, N)`` and ``tt``
    holds one scalar per point.

    Raises ``ValueError`` when ``type`` is not a known dataset name.
    """
    rng = np.random.RandomState(random_state)
    try:
        generator = _DATASET_GENERATORS[type]
    except KeyError:
        raise ValueError(
            'Dataset %r is not supported; choose one of %s'
            % (type, sorted(_DATASET_GENERATORS))
        ) from None
    return generator(rng, N)

def plot(data, embedded_data, colors='m', method=None):
    """Show the original 3-D data next to its 2-D embedding.

    Parameters:
        data: indexable with three coordinate sequences ``data[0..2]``.
        embedded_data: indexable with two coordinate sequences
            ``embedded_data[0..1]``.
        colors: passed as ``c`` to both scatter plots.
        method: optional method name appended to the embedding title.

    Moving the mouse over a point in either plot highlights the point with
    the same index in both plots. Blocks in ``plt.show()`` until the window
    is closed.
    """
    fig = plt.figure()
    fig.set_facecolor('white')

    ax_original = fig.add_subplot(121, projection='3d')
    scatter_original = ax_original.scatter(data[0], data[1], data[2], c=colors, cmap=plt.cm.Spectral, s=5, picker=True)
    plt.axis('tight')
    plt.axis('off')
    plt.title('Original', fontsize=9)

    ax_embedding = fig.add_subplot(122)
    scatter_embedding = ax_embedding.scatter(embedded_data[0], embedded_data[1], c=colors, cmap=plt.cm.Spectral, s=5, picker=True)
    plt.axis('tight')
    plt.axis('off')
    # The conditional is parenthesised so that the base title 'Embedding' is
    # kept when no method name is available.
    title = 'Embedding' + (' with ' + method if method else '')
    plt.title(title, fontsize=9, wrap=True)

    highlighted_points = []  # Artists of the currently highlighted pair

    def highlight(index):
        """Mark point ``index`` on both plots, replacing any previous mark."""
        # Reset previous highlighted points
        for point in highlighted_points:
            point.remove()
        highlighted_points.clear()

        # Highlight the current point on both scatter plots
        point1 = ax_original.scatter([data[0][index]], [data[1][index]], [data[2][index]], color='white', s=25, edgecolor='black', zorder=3)
        point2 = ax_embedding.scatter([embedded_data[0][index]], [embedded_data[1][index]], color='white', s=25, edgecolor='black', zorder=3)
        highlighted_points.append(point1)
        highlighted_points.append(point2)
        fig.canvas.draw_idle()

    def on_hover(event):
        """Mouse-motion handler: highlight the point under the cursor, if any."""
        if event.inaxes == ax_original:
            cont, ind = scatter_original.contains(event)
        elif event.inaxes == ax_embedding:
            cont, ind = scatter_embedding.contains(event)
        else:
            return

        if cont:
            # Several points may lie under the cursor; the first one is used.
            index = ind['ind'][0]
            highlight(index)

    fig.canvas.mpl_connect('motion_notify_event', on_hover)

    plt.show()
Loading

0 comments on commit a73d276

Please sign in to comment.