
Fix indexing issues in FFM
mdymczyk committed May 31, 2018
1 parent a127759 commit d2b97a3
Showing 8 changed files with 192 additions and 115 deletions.
62 changes: 57 additions & 5 deletions src/base/ffm/ffm.cpp
@@ -38,44 +38,96 @@ void FFM<T>::predict(const Dataset<T> &dataset, T *predictions) {
trainer.predict(predictions);
}

template <typename T>
void computeScales(T *scales, const T* values, const int *rowPositions, Params &_param) {
#pragma omp parallel for
for(int i = 0; i < _param.numRows; i++) {
if(_param.normalize) {
T scale = 0.0;
for (int e = rowPositions[i]; e < rowPositions[i + 1]; e++) {
scale += values[e] * values[e];
}
scales[i] = 1.0 / scale;
} else {
scales[i] = 1.0;
}
}
}

template <typename T>
T maxElement(const T *data, int size) {
T maxVal = 0.0;

#pragma omp parallel for reduction(max : maxVal)
for (int i = 0; i < size; i++) {
if (data[i] > maxVal) {
maxVal = data[i];
}
}

return maxVal;
}

/**
* C API method
*/
-void ffm_fit_float(int* features, int* fields, float* values, int *labels, float *scales, int *rowPositions, float *w, Params &_param) {
+void ffm_fit_float(int* features, int* fields, float* values, int *labels, int *rowPositions, float *w, Params &_param) {
log_debug(_param.verbose, "Converting %d float rows into a dataset.", _param.numRows);
float *scales = (float*) malloc(sizeof(float) * _param.numRows);
computeScales(scales, values, rowPositions, _param);

_param.numFields = maxElement(fields, _param.numNodes) + 1;
_param.numFeatures = maxElement(features, _param.numNodes) + 1;

Dataset<float> dataset(_param.numFields, _param.numFeatures, _param.numRows, _param.numNodes, features, fields, values, labels, scales, rowPositions);
FFM<float> ffm(_param);
_param.printParams();
log_debug(_param.verbose, "Running FFM fit for float.");
Timer timer;
timer.tic();
ffm.fit(dataset);
+ffm.trainer.model->copyTo(w);
timer.toc();
log_debug(_param.verbose, "Float fit took %f.", timer.pop());
-ffm.trainer.model->copyTo(w);
}

-void ffm_fit_double(int* features, int* fields, double* values, int *labels, double *scales, int *rowPositions, double *w, Params &_param) {
+void ffm_fit_double(int* features, int* fields, double* values, int *labels, int *rowPositions, double *w, Params &_param) {
log_debug(_param.verbose, "Converting %d double rows into a dataset.", _param.numRows);
double *scales = (double*) malloc(sizeof(double) * _param.numRows);
computeScales(scales, values, rowPositions, _param);

_param.numFields = maxElement(fields, _param.numNodes) + 1;
_param.numFeatures = maxElement(features, _param.numNodes) + 1;

Dataset<double> dataset(_param.numFields, _param.numFeatures, _param.numRows, _param.numNodes, features, fields, values, labels, scales, rowPositions);
FFM<double> ffm(_param);
_param.printParams();
log_debug(_param.verbose, "Running FFM fit for double.");
Timer timer;
timer.tic();
ffm.fit(dataset);
ffm.trainer.model->copyTo(w);
timer.toc();
log_debug(_param.verbose, "Double fit took %f.", timer.pop());
}

-void ffm_predict_float(int *features, int* fields, float* values, float *scales, int* rowPositions, float *predictions, float *w, Params &_param) {
+void ffm_predict_float(int *features, int* fields, float* values, int* rowPositions, float *predictions, float *w, Params &_param) {
log_debug(_param.verbose, "Converting %d float rows into a dataset for predictions.", _param.numRows);
float *scales = (float*) malloc(sizeof(float) * _param.numRows);
computeScales(scales, values, rowPositions, _param);

Dataset<float> dataset(_param.numFields, _param.numFeatures, _param.numRows, _param.numNodes, features, fields, values, nullptr, scales, rowPositions);
FFM<float> ffm(_param, w);
_param.printParams();
log_debug(_param.verbose, "Running FFM predict for float.");
ffm.predict(dataset, predictions);
}

-void ffm_predict_double(int *features, int* fields, double* values, double *scales, int* rowPositions, double *predictions, double *w, Params &_param){
+void ffm_predict_double(int *features, int* fields, double* values, int* rowPositions, double *predictions, double *w, Params &_param){
log_debug(_param.verbose, "Converting %d double rows into a dataset for predictions.", _param.numRows);
double *scales = (double*) malloc(sizeof(double) * _param.numRows);
computeScales(scales, values, rowPositions, _param);

Dataset<double> dataset(_param.numFields, _param.numFeatures, _param.numRows, _param.numNodes, features, fields, values, nullptr, scales, rowPositions);
FFM<double> ffm(_param, w);
_param.printParams();
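The net effect of the ffm.cpp changes: scales disappear from the C API, each row's scale is recomputed internally by computeScales as the inverse squared L2 norm of the row (or 1.0 when normalization is off), and numFields/numFeatures are inferred as the largest field/feature index plus one. A minimal host-side sketch of that normalization over a toy CSR layout (array contents are illustrative, not from the repository):

#include <cstdio>

int main() {
  // Two rows, {1, 2} and {3}; rowPositions holds numRows + 1 offsets,
  // so row i spans values[rowPositions[i]] .. values[rowPositions[i + 1] - 1].
  float values[] = {1.0f, 2.0f, 3.0f};
  int rowPositions[] = {0, 2, 3};
  const int numRows = 2;

  for (int i = 0; i < numRows; i++) {
    float sumSq = 0.0f;
    for (int e = rowPositions[i]; e < rowPositions[i + 1]; e++)
      sumSq += values[e] * values[e];
    // Same formula computeScales applies when _param.normalize is set.
    printf("scale[%d] = %f\n", i, 1.0f / sumSq); // 0.2 and 0.111...
  }
  return 0;
}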
19 changes: 8 additions & 11 deletions src/base/ffm/model.cpp
@@ -10,7 +10,7 @@
namespace ffm {

template<typename T>
-Model<T>::Model(Params &params) : weights(2 * params.numFeatures * params.numFields * params.k) {
+Model<T>::Model(Params &params) : weights(2 * (size_t)params.numFeatures * (size_t)params.numFields * (size_t)params.k) {
this->numFeatures = params.numFeatures;
this->numFields = params.numFields;
this->k = params.k;
@@ -21,31 +21,28 @@ Model<T>::Model(Params &params) : weights(2 * params.numFeatures * params.numFie
std::default_random_engine generator(params.seed);
std::uniform_real_distribution<T> distribution(0.0, 1.0);

-for (int i = 0; i < weights.size(); i += 2) {
+for (size_t i = 0; i < weights.size(); i += 2) {
this->weights[i] = coef * distribution(generator);
this->weights[i + 1] = 1.0;
}

}

template<typename T>
-Model<T>::Model(Params &params, const T *_weights) : weights(params.numFeatures * params.numFields * params.k) {
+Model<T>::Model(Params &params, const T *_weights) : weights(_weights, _weights + (size_t)params.numFeatures * params.numFields * params.k) {
this->numFeatures = params.numFeatures;
this->numFields = params.numFields;
this->k = params.k;
this->normalize = params.normalize;

-for (int i = 0; i < this->weights.size(); i++) {
-// TODO memcpy?
-this->weights[i] = _weights[i];
-}
}

template<typename T>
void Model<T>::copyTo(T *dstWeights) {
-// Copy only weights
-std::copy_if( this->weights.begin(), this->weights.end(), dstWeights,
-[]( int x ) { return x % 2; } );
+// TODO make this faster, maybe some copy_if with index
+#pragma omp parallel for
+for(size_t i = 0; i < this->weights.size(); i+=2) {
+dstWeights[i / 2] = this->weights.data()[i];
+}
};

template<typename T>
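The model.cpp changes hinge on the weight layout: the buffer holds two entries per latent weight, the weight itself at even indices and a companion value initialized to 1.0 at odd indices (presumably a per-weight gradient accumulator), so it is sized 2 * numFeatures * numFields * k, with size_t casts keeping that product from overflowing 32-bit int on large models, and copyTo strides by 2 to hand back only the weights. A small sketch of the layout and extraction (sizes and values invented for illustration):

#include <cstdio>
#include <vector>

int main() {
  const size_t numWeights = 4;
  std::vector<float> weights(2 * numWeights);
  for (size_t i = 0; i < weights.size(); i += 2) {
    weights[i] = 0.1f * (i / 2 + 1); // stand-in for coef * distribution(generator)
    weights[i + 1] = 1.0f;           // companion slot, as in the constructor
  }

  // copyTo's extraction: keep every even-indexed element, packed densely.
  std::vector<float> dst(numWeights);
  for (size_t i = 0; i < weights.size(); i += 2)
    dst[i / 2] = weights[i];

  for (float w : dst) printf("%f ", w); // 0.100000 0.200000 0.300000 0.400000
  printf("\n");
  return 0;
}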
14 changes: 10 additions & 4 deletions src/gpu/ffm/batching_gpu.cu
@@ -8,6 +8,7 @@
#include "../../common/timer.h"
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/adjacent_difference.h>

namespace ffm {
@@ -18,11 +19,12 @@ namespace ffm {
*/
template <typename T>
int DatasetBatchGPU<T>::widestRow() {
-thrust::device_vector<int> tmpRowSizes(this->numRows);
-thrust::device_vector<int> rowPositions(this->rowPositions, this->rowPositions + this->numRows);
+thrust::device_vector<int> tmpRowSizes(this->numRows + 1);
+thrust::device_vector<int> rowPositions(this->rowPositions, this->rowPositions + this->numRows + 1);
thrust::adjacent_difference(rowPositions.begin(), rowPositions.end(), tmpRowSizes.begin(), thrust::minus<int>());

-thrust::device_vector<int>::iterator iter = thrust::max_element(tmpRowSizes.begin(), tmpRowSizes.end());
+// Skip the first output element: adjacent_difference copies input[0] there verbatim (a row offset, not a row size)
+thrust::device_vector<int>::iterator iter = thrust::max_element(tmpRowSizes.begin() + 1, tmpRowSizes.end());

int max_value = *iter;

@@ -107,7 +109,11 @@ DatasetBatch<T> *DatasetBatcherGPU<T>::nextBatch(int batchSize) {
actualBatchSize,
batchSize);

-DatasetBatchGPU<T> *batch = new DatasetBatchGPU<T>(this->dataset.features + this->pos, this->dataset.fields + this->pos, this->dataset.values + this->pos,
+int moveBy = 0;
+CUDA_CHECK(cudaMemcpy(&moveBy, &(this->dataset.rowPositions[this->pos]), sizeof(int), cudaMemcpyDeviceToHost));
+
+DatasetBatchGPU<T> *batch = new DatasetBatchGPU<T>(this->dataset.features + moveBy, this->dataset.fields + moveBy, this->dataset.values + moveBy,
this->dataset.labels + this->pos, this->dataset.scales + this->pos, this->dataset.rowPositions + this->pos,
actualBatchSize);
this->pos = this->pos + actualBatchSize;
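This file carries the commit's headline indexing fix. The feature/field/value arrays are indexed by element offset while labels, scales and rowPositions are indexed by row, so advancing all of them by this->pos (a row index) pointed the element arrays at the wrong slice; the batch must instead start at rowPositions[pos], fetched from device memory with a single cudaMemcpy. The widestRow fix is the same idea: with numRows + 1 offsets, adjacent_difference yields all numRows row sizes, and output element 0 (a copied offset, not a difference) must be skipped. A host-side sketch of the row-index/element-offset distinction (data made up):

#include <cstdio>

int main() {
  // Three rows with 2, 1 and 3 entries respectively.
  int rowPositions[] = {0, 2, 3, 6};
  float values[] = {1, 2, 3, 4, 5, 6};

  int pos = 1;                    // batch starts at row 1
  int moveBy = rowPositions[pos]; // element offset of that row: 2
  printf("first value of batch: %f\n", values[moveBy]); // 3, not values[pos] == 2
  return 0;
}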
16 changes: 9 additions & 7 deletions src/gpu/ffm/model_gpu.cuh
@@ -16,7 +16,7 @@ template<typename T>
class ModelGPU : public Model<T> {
public:
ModelGPU(Params &params) : Model<T>(params), localWeights(params.nGpus) {
-#pragma omp for
+#pragma omp parallel for
for(int i = 0; i < params.nGpus; i++) {
log_verbose(params.verbose, "Copying weights of size %zu to GPU %d for predictions", this->weights.size(), i);
localWeights[i] = new thrust::device_vector<T>(this->weights.size());
@@ -25,7 +25,7 @@ class ModelGPU : public Model<T> {
}

ModelGPU(Params &params, const T *weights) : Model<T>(params, weights), localWeights(params.nGpus) {
-#pragma omp for
+#pragma omp parallel for
for(int i = 0; i < params.nGpus; i++) {
log_verbose(params.verbose, "Copying weights of size %zu to GPU %d for predictions", this->weights.size(), i);
localWeights[i] = new thrust::device_vector<T>(this->weights.size());
@@ -35,7 +35,6 @@
}

~ModelGPU() {
-#pragma omp for
for(int i = 0; i < localWeights.size(); i++) {
delete localWeights[i];
}
@@ -47,13 +46,16 @@ class ModelGPU : public Model<T> {
// TODO probably can be done with counting iterator
thrust::device_vector<int> indices(this->localWeights[0]->size());
thrust::sequence(indices.begin(), indices.end(), 0, 1);
thrust::device_vector<T> onlyWeights(indices.size() / 2);

thrust::copy_if(
-thrust::raw_pointer_cast(this->localWeights[0]->data()),
-thrust::raw_pointer_cast(this->localWeights[0]->data()) + this->localWeights[0]->size(),
-thrust::raw_pointer_cast(indices.data()),
-dstWeights,
+this->localWeights[0]->begin(),
+this->localWeights[0]->end(),
+indices.begin(),
+onlyWeights.begin(),
[=] __device__(int idx) { return idx % 2 == 0; });

CUDA_CHECK(cudaMemcpy(dstWeights, thrust::raw_pointer_cast(onlyWeights.data()), onlyWeights.size() * sizeof(T), cudaMemcpyDeviceToHost));
};

T* weightsRawPtr(int i) override {
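Two fixes in model_gpu.cuh: an orphaned #pragma omp for outside any parallel region binds to a team of one thread and runs serially, so the per-GPU weight copies were never actually parallel until the pragma became #pragma omp parallel for; and copyTo now stages the even-indexed weights into a device-side buffer with a stencil copy_if before a single cudaMemcpy to the host, where the old raw-pointer call appears to have mixed device input with a host destination. A small sketch of the pragma difference (compile with -fopenmp; output interleaving will vary):

#include <cstdio>
#include <omp.h>

int main() {
  #pragma omp for // orphaned: no enclosing parallel region, runs on a single thread
  for (int i = 0; i < 4; i++)
    printf("omp for: i=%d on thread %d\n", i, omp_get_thread_num());

  #pragma omp parallel for // spawns the thread team and splits the iterations
  for (int i = 0; i < 4; i++)
    printf("parallel for: i=%d on thread %d\n", i, omp_get_thread_num());

  return 0;
}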
