version 0.1.0 - initialize project
sunhailin-Leo committed Mar 13, 2023
1 parent 7d1677b commit d2126b3
Showing 17 changed files with 41,590 additions and 0 deletions.
52 changes: 52 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,52 @@
# Copyright (c) 2022-present, Zejun Wang ([email protected])
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.10)
cmake_policy(SET CMP0079 NEW)

project(easytokenizer VERSION 0.2.0 LANGUAGES CXX C)

set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR x86_64)

#set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/../)
#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -fPIC -funroll-loops")

# OMP
option(WITH_OMP "Compile with OpenMP" OFF)
if (WITH_OMP)
  find_package(OpenMP REQUIRED)
  add_definitions(-DWITH_OMP)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()

message("CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}")

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/tokenizer)

# add_library (static)
add_library(tokenizer_static_lib
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/tokenizer.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/utf8proc.c)
target_link_libraries(tokenizer_static_lib pthread)
set_target_properties(tokenizer_static_lib PROPERTIES OUTPUT_NAME tokenizer)

# add_library (SHARED)
add_library(tokenizer_shared_lib SHARED
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/tokenizer.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/utf8proc.c)
target_link_libraries(tokenizer_shared_lib pthread)
set_target_properties(tokenizer_shared_lib PROPERTIES OUTPUT_NAME tokenizer)

# CGO wrapper library
add_library(tokenizer SHARED easytokenizer_wrapper.cc)
target_link_libraries(tokenizer PUBLIC tokenizer_static_lib)
target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
# GNUInstallDirs defines CMAKE_INSTALL_LIBDIR, which install() below relies on
include(GNUInstallDirs)
install(TARGETS tokenizer DESTINATION ${CMAKE_INSTALL_LIBDIR})
22 changes: 22 additions & 0 deletions README.md
@@ -1,2 +1,24 @@
# easytokenizer-to-go
A Golang binding for https://github.com/zejunwang1/easytokenizer

**This project targets easytokenizer 0.2.0 and renames some wrapper functions, because C++ function overloading cannot be expressed through the C interface that CGO requires.**

### Version

* version 0.1.0
  * Implemented the CGO APIs `initTokenizer`, `freeTokenizer`, `encode`, `encodeWithIds`, and `wordPieceTokenize`.
  * Implemented the Golang APIs `NewTokenizer`, `Close`, `Encode`, `EncodeWithIds`, and `WordPieceTokenize`.

### Build Library

* Linux/macOS
* `sh build.sh`

### Usage

* When building a Golang program, run `export CGO_CXXFLAGS=-std=c++11` before `go build` / `go run` / `go test`, for example:
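
  A minimal sketch (the build target is a placeholder for your own package):

  ```sh
  # the flag must be exported before any go command that compiles the cgo code
  export CGO_CXXFLAGS=-std=c++11
  go build ./...
  ```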


### Example

* See the `example` folder; a minimal usage sketch is shown below.
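
A minimal end-to-end sketch (the import path and the `vocab.txt` vocabulary file are placeholder assumptions; adapt them to your module layout):

```go
package main

import (
	"fmt"

	// placeholder import path for this binding
	tokenizer "github.com/sunhailin-Leo/easytokenizer-to-go"
)

func main() {
	// "vocab.txt" stands in for a BERT-style vocabulary file
	t := tokenizer.NewTokenizer("vocab.txt", true)
	defer t.Close()

	// token ids in a fixed-length, zero-padded buffer
	ids := t.Encode("easytokenizer is fast", 16)
	fmt.Println(ids)

	// wordpiece tokens with their offsets
	tokens, offsets := t.WordPieceTokenize("easytokenizer is fast")
	fmt.Println(tokens, offsets)
}
```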
5 changes: 5 additions & 0 deletions build.sh
@@ -0,0 +1,5 @@
#!/bin/bash
set -e
mkdir -p build && cd build
cmake ..
make && make install

96 changes: 96 additions & 0 deletions easytokenizer.go
@@ -0,0 +1,96 @@
package tokenizer

// #cgo CXXFLAGS: -std=c++11
// #cgo LDFLAGS: -L${SRCDIR} -ltokenizer -lm
// #include "easytokenizer_wrapper.h"
import "C"
import (
	"unsafe"
)

type EasyTokenizer struct {
	easyTokenizer  C.EasyTokenizer
	vocabPath      string
	doLowerCase    bool
	codePointLevel bool
}

// NewTokenizer creates and initializes an EasyTokenizer.
func NewTokenizer(vocabPath string, doLowerCase bool) *EasyTokenizer {
	var tokenizer EasyTokenizer
	tokenizer.vocabPath = vocabPath
	tokenizer.doLowerCase = doLowerCase
	// code-point level is always enabled
	tokenizer.codePointLevel = true
	// pass the vocab path to C and free our copy once the tokenizer is built
	cVocabPath := C.CString(vocabPath)
	defer C.free(unsafe.Pointer(cVocabPath))
	tokenizer.easyTokenizer = C.initTokenizer(cVocabPath, C.bool(tokenizer.doLowerCase), C.bool(tokenizer.codePointLevel))
	return &tokenizer
}

// Close releases the underlying C++ tokenizer.
func (t *EasyTokenizer) Close() {
	// freeTokenizer deletes the C++ object; a plain C.free would skip its destructor
	C.freeTokenizer(t.easyTokenizer)
	t.easyTokenizer = nil
}

// Encode converts text into a fixed-length buffer of token ids.
func (t *EasyTokenizer) Encode(text string, maxSeqLength int) []int32 {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))
	// allocate the output buffer; entries beyond the encoded length stay 0
	outputData := make([]int32, maxSeqLength)
	// call the C wrapper with add_cls_sep=true and truncation=true
	C.encode(t.easyTokenizer, cText, C.bool(true), C.bool(true), C.int(maxSeqLength), (*C.int)(unsafe.Pointer(&outputData[0])))
	return outputData
}

// EncodeWithIds returns input ids, token type ids, attention mask and offsets.
func (t *EasyTokenizer) EncodeWithIds(text string, maxSeqLength int) ([]int32, []int32, []int32, []int32) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))
	// output arrays allocated by the C wrapper, plus their lengths
	var inputIds, tokenTypeIds, attentionMask, offsets *C.int
	var numInputIds, numTokenTypeIds, numAttentionMask, numOffsets C.int
	// call the C wrapper with add_cls_sep=true, padding=true, truncation=true
	C.encodeWithIds(t.easyTokenizer, cText,
		&inputIds, &numInputIds,
		&tokenTypeIds, &numTokenTypeIds,
		&attentionMask, &numAttentionMask,
		&offsets, &numOffsets,
		C.bool(true), C.bool(true), C.bool(true), C.int(maxSeqLength))
	// copy each C array into a Go-allocated slice before freeing it, so the
	// returned slices never alias freed C memory
	sliceInputIds := make([]int32, int(numInputIds))
	copy(sliceInputIds, (*[1 << 30]int32)(unsafe.Pointer(inputIds))[:numInputIds:numInputIds])
	sliceTokenTypeIds := make([]int32, int(numTokenTypeIds))
	copy(sliceTokenTypeIds, (*[1 << 30]int32)(unsafe.Pointer(tokenTypeIds))[:numTokenTypeIds:numTokenTypeIds])
	sliceAttentionMask := make([]int32, int(numAttentionMask))
	copy(sliceAttentionMask, (*[1 << 30]int32)(unsafe.Pointer(attentionMask))[:numAttentionMask:numAttentionMask])
	sliceOffsets := make([]int32, int(numOffsets))
	copy(sliceOffsets, (*[1 << 30]int32)(unsafe.Pointer(offsets))[:numOffsets:numOffsets])
	// release the C buffers
	C.free(unsafe.Pointer(inputIds))
	C.free(unsafe.Pointer(tokenTypeIds))
	C.free(unsafe.Pointer(attentionMask))
	C.free(unsafe.Pointer(offsets))
	return sliceInputIds, sliceTokenTypeIds, sliceAttentionMask, sliceOffsets
}

// WordPieceTokenize returns the wordpiece tokens of text and their offsets.
func (t *EasyTokenizer) WordPieceTokenize(text string) ([]string, []int32) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))
	// output arrays allocated by the C wrapper, plus their lengths
	var tokens **C.char
	var offsets *C.int
	var numTokens, numOffsets C.int
	C.wordPieceTokenize(t.easyTokenizer, cText, &tokens, &numTokens, &offsets, &numOffsets)
	// view the C arrays as Go slices (classic cgo big-array cast)
	sliceTokens := (*[1 << 30]*C.char)(unsafe.Pointer(tokens))[:numTokens:numTokens]
	sliceOffsets := (*[1 << 30]int32)(unsafe.Pointer(offsets))[:numOffsets:numOffsets]
	// copy each token into a Go string, then free the C string the wrapper malloc'd
	goTokens := make([]string, int(numTokens))
	for i := 0; i < int(numTokens); i++ {
		goTokens[i] = C.GoString(sliceTokens[i])
		C.free(unsafe.Pointer(sliceTokens[i]))
	}
	// copy the offsets into Go memory before freeing the C buffers, so the
	// returned slice never aliases freed C memory
	goOffsets := make([]int32, int(numOffsets))
	copy(goOffsets, sliceOffsets)
	C.free(unsafe.Pointer(tokens))
	C.free(unsafe.Pointer(offsets))
	return goTokens, goOffsets
}
84 changes: 84 additions & 0 deletions easytokenizer_wrapper.cc
@@ -0,0 +1,84 @@
//
// Created by sunhailin-Leo on 2023/3/6.
//
#include <cstring>  // strcpy
#include <string>
#include <vector>

#include "easytokenizer_wrapper.h"
#include "tokenizer/tokenizer.h"

EasyTokenizer initTokenizer(char* vocab_path, bool do_lower_case, bool codepoint_level) {
  tokenizer::Tokenizer *tk = new tokenizer::Tokenizer(vocab_path, do_lower_case, codepoint_level);
  return (void *) tk;
}

void freeTokenizer(EasyTokenizer tokenizer) {
  // delete the object created with new in initTokenizer; releasing it with
  // free() from the Go side would skip the destructor
  delete (tokenizer::Tokenizer *) tokenizer;
}

void encode(EasyTokenizer tokenizer, char* text, bool add_cls_sep, bool truncation, int max_length, int* output_data) {
  auto data = ((tokenizer::Tokenizer *) tokenizer)->encode(text, add_cls_sep, truncation, max_length);
  // copy the encoded ids directly into the caller-provided buffer
  size_t size = data.size();
  for (size_t i = 0; i < size; i++)
    output_data[i] = (int) data[i];
}

void encodeWithIds(
    EasyTokenizer tokenizer,
    char* text,
    int** input_ids,
    int* num_input_ids,
    int** token_type_ids,
    int* num_token_type_ids,
    int** attention_mask,
    int* num_attention_mask,
    int** offsets,
    int* num_offsets,
    bool add_cls_sep,
    bool padding,
    bool truncation,
    int max_length) {
  std::vector<tokenizer::SizeT> input_ids_vector;
  std::vector<tokenizer::SizeT> token_type_ids_vector;
  std::vector<tokenizer::SizeT> attention_mask_vector;
  std::vector<tokenizer::SizeT> offsets_vector;

  ((tokenizer::Tokenizer *) tokenizer)->encodeWithIds(text, input_ids_vector, token_type_ids_vector, attention_mask_vector, offsets_vector, add_cls_sep, padding, truncation, max_length);

  // copy a vector of tokenizer::SizeT into a malloc'd int buffer, element by
  // element: memcpy would be incorrect if SizeT is wider than int
  auto copy_out = [](const std::vector<tokenizer::SizeT>& vec, int** out, int* num) {
    *num = (int) vec.size();
    *out = (int*) malloc(sizeof(int) * (*num));
    for (int i = 0; i < *num; i++)
      (*out)[i] = (int) vec[i];
  };

  copy_out(input_ids_vector, input_ids, num_input_ids);
  copy_out(token_type_ids_vector, token_type_ids, num_token_type_ids);
  copy_out(attention_mask_vector, attention_mask, num_attention_mask);
  copy_out(offsets_vector, offsets, num_offsets);
}

void wordPieceTokenize(EasyTokenizer tokenizer, char* text, char*** tokens, int* num_tokens, int** offsets, int* num_offsets) {
  std::vector<std::string> token_vector;
  std::vector<tokenizer::SizeT> offset_vector;
  ((tokenizer::Tokenizer *) tokenizer)->wordpiece_tokenize_with_offsets(text, token_vector, offset_vector);

  // each token string is malloc'd individually; the caller frees them
  *num_tokens = (int) token_vector.size();
  *tokens = (char**) malloc(sizeof(char*) * (*num_tokens));
  for (int i = 0; i < *num_tokens; i++) {
    (*tokens)[i] = (char*) malloc(token_vector[i].size() + 1);
    strcpy((*tokens)[i], token_vector[i].c_str());
  }

  // element-wise copy: memcpy would be incorrect if tokenizer::SizeT is wider than int
  *num_offsets = (int) offset_vector.size();
  *offsets = (int*) malloc(sizeof(int) * (*num_offsets));
  for (int i = 0; i < *num_offsets; i++)
    (*offsets)[i] = (int) offset_vector[i];
}
35 changes: 35 additions & 0 deletions easytokenizer_wrapper.h
@@ -0,0 +1,35 @@
//
// Created by sunhailin-Leo on 2023/3/6.
//
#ifndef EASYTOKENIZER_WRAPPER_H
#define EASYTOKENIZER_WRAPPER_H

#include <stdlib.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef void *EasyTokenizer;

EasyTokenizer initTokenizer(char* vocab_path, bool do_lower_case, bool codepoint_level);

/* releases a tokenizer created by initTokenizer */
void freeTokenizer(EasyTokenizer tokenizer);

void encode(EasyTokenizer tokenizer, char* text, bool add_cls_sep, bool truncation, int max_length, int* output_data);

void encodeWithIds(
    EasyTokenizer tokenizer,
    char* text,
    int** input_ids,
    int* num_input_ids,
    int** token_type_ids,
    int* num_token_type_ids,
    int** attention_mask,
    int* num_attention_mask,
    int** offsets,
    int* num_offsets,
    bool add_cls_sep,
    bool padding,
    bool truncation,
    int max_length);

void wordPieceTokenize(EasyTokenizer tokenizer, char* text, char*** tokens, int* num_tokens, int** offsets, int* num_offsets);

#ifdef __cplusplus
}
#endif

#endif /* EASYTOKENIZER_WRAPPER_H */
