-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7d1677b
commit d2126b3
Showing
17 changed files
with
41,590 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# Copyright (c) 2022-present, Zejun Wang ([email protected])
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.10)
cmake_policy(SET CMP0079 NEW)

project(easytokenizer LANGUAGES CXX C VERSION 0.2.0)

# NOTE(review): the previous `set(CMAKE_SYSTEM_NAME Linux)` /
# `set(CMAKE_SYSTEM_PROCESSOR x86_64)` lines were removed. Those variables
# belong in a toolchain file; setting them after project() does not configure
# real cross-compilation and breaks configuring on non-Linux/x86_64 hosts.

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Everything ends up inside shared libraries, so build all objects as PIC
# (replaces the hand-written -fPIC flag).
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Optional OpenMP support; consumers get -DWITH_OMP and the OpenMP flags
# through the imported OpenMP::OpenMP_CXX target below.
option(WITH_OMP "Compile with OpenMP" OFF)
if(WITH_OMP)
  find_package(OpenMP REQUIRED)
endif()

# Portable replacement for linking the bare "pthread" string.
find_package(Threads REQUIRED)

set(TOKENIZER_SOURCES
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/tokenizer.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/utf8proc.c)

# Static and shared builds of the core tokenizer; both output "libtokenizer".
add_library(tokenizer_static_lib STATIC ${TOKENIZER_SOURCES})
add_library(tokenizer_shared_lib SHARED ${TOKENIZER_SOURCES})

foreach(lib IN ITEMS tokenizer_static_lib tokenizer_shared_lib)
  # Previously a global include_directories(); scoped to the targets instead.
  target_include_directories(${lib} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer)
  # Previously appended to CMAKE_CXX_FLAGS; scoped to the targets instead.
  target_compile_options(${lib} PRIVATE -Wall -O3 -funroll-loops)
  target_link_libraries(${lib} PUBLIC Threads::Threads)
  if(WITH_OMP)
    target_compile_definitions(${lib} PUBLIC WITH_OMP)
    target_link_libraries(${lib} PUBLIC OpenMP::OpenMP_CXX)
  endif()
  set_target_properties(${lib} PROPERTIES OUTPUT_NAME tokenizer)
endforeach()

# CGO wrapper: shared library exposing the C API in easytokenizer_wrapper.h.
add_library(tokenizer SHARED easytokenizer_wrapper.cc)
target_link_libraries(tokenizer PUBLIC tokenizer_static_lib)
target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# Defines CMAKE_INSTALL_LIBDIR, which was previously used without being set.
include(GNUInstallDirs)
install(TARGETS tokenizer DESTINATION ${CMAKE_INSTALL_LIBDIR})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,24 @@ | ||
# easytokenizer-to-go | ||
Golang binding for https://github.com/zejunwang1/easytokenizer | ||
|
||
**Note: this project targets easytokenizer 0.2.0 and renames some wrapper functions, because C++ function overloading cannot be expressed in the C API used by CGO.**
|
||
### Version | ||
|
||
* version 0.1.0 | ||
* Finish API `initTokenizer`, `encode`, `encodeWithIds`, `wordPieceTokenize` in CGO. | ||
* Finish API `NewTokenizer`, `Close`, `Encode`, `EncodeWithIds`, `WordPieceTokenize` in Golang. | ||
|
||
### Build Library | ||
|
||
* Linux/MacOS | ||
* `sh build.sh` | ||
|
||
### Usage | ||
|
||
* When building golang program, please add `export CGO_CXXFLAGS=-std=c++11` command before `go build / run / test ...` | ||
|
||
|
||
### Example | ||
|
||
* In example folder. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/bin/bash
# Build and install the easytokenizer CGO wrapper library.
# Abort on the first failing command and on unset variables; previously a
# failed `cmake ..` still fell through to `make`.
set -euo pipefail

mkdir -p build
cd build
cmake ..
make
make install
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
package tokenizer | ||
|
||
// #cgo CXXFLAGS: -std=c++11 | ||
// #cgo LDFLAGS: -L${SRCDIR} -ltokenizer -lm | ||
// #include "easytokenizer_wrapper.h" | ||
import "C" | ||
import ( | ||
"unsafe" | ||
) | ||
|
||
// EasyTokenizer wraps the opaque C handle returned by C.initTokenizer
// together with the parameters it was constructed from.
type EasyTokenizer struct {
	easyTokenizer  C.EasyTokenizer // opaque pointer to the native tokenizer instance
	vocabPath      string          // vocabulary file path passed to NewTokenizer
	doLowerCase    bool            // whether the native tokenizer lower-cases input
	codePointLevel bool            // always set to true by NewTokenizer
}
|
||
// NewTokenizer init EasyTokenizer | ||
func NewTokenizer(vocabPath string, doLowerCase bool) *EasyTokenizer { | ||
var tokenizer EasyTokenizer | ||
tokenizer.vocabPath = vocabPath | ||
tokenizer.doLowerCase = doLowerCase | ||
// just true | ||
tokenizer.codePointLevel = true | ||
// initTokenizer | ||
tokenizer.easyTokenizer = C.initTokenizer(C.CString(vocabPath), C.bool(tokenizer.doLowerCase), C.bool(tokenizer.codePointLevel)) | ||
return &tokenizer | ||
} | ||
|
||
// Close releases the native tokenizer handle and clears it so further use
// of t will fail fast on a nil handle.
//
// NOTE(review): the handle is allocated on the C++ side with `new` (see
// initTokenizer in easytokenizer_wrapper.cc), but released here with
// C.free, which skips the destructor and mixes allocators. The wrapper
// should expose a destructor function that calls `delete` — confirm and fix
// on the C side.
func (t *EasyTokenizer) Close() {
	C.free(unsafe.Pointer(t.easyTokenizer))
	t.easyTokenizer = nil
}
|
||
func (t *EasyTokenizer) Encode(text string, maxSeqLength int) []int32 { | ||
// text | ||
cText := C.CString(text) | ||
defer C.free(unsafe.Pointer(cText)) | ||
// make allocation of []int32 slice | ||
outputData := make([]int32, maxSeqLength) | ||
// call function | ||
C.encode(t.easyTokenizer, cText, C.bool(true), C.bool(true), C.int(maxSeqLength), (*C.int)(unsafe.Pointer(&outputData[0]))) | ||
return outputData | ||
} | ||
|
||
func (t *EasyTokenizer) EncodeWithIds(text string, maxSeqLength int) ([]int32, []int32, []int32, []int32) { | ||
// text | ||
cText := C.CString(text) | ||
defer C.free(unsafe.Pointer(cText)) | ||
// inputIds, tokenTypeIds, attentionMask, offsets | ||
var inputIds, tokenTypeIds, attentionMask, offsets *C.int | ||
// slice number of inputIds, tokenTypeIds, attentionMask, offsets | ||
var numInputIds, numTokenTypeIds, numAttentionMask, numOffsets C.int | ||
// call function | ||
C.encodeWithIds(t.easyTokenizer, cText, | ||
&inputIds, &numInputIds, | ||
&tokenTypeIds, &numTokenTypeIds, | ||
&attentionMask, &numAttentionMask, | ||
&offsets, &numOffsets, | ||
C.bool(true), C.bool(true), C.bool(true), C.int(maxSeqLength)) | ||
// to Golang Slice | ||
sliceInputIds := (*[1 << 30]int32)(unsafe.Pointer(inputIds))[:numInputIds:numInputIds] | ||
sliceTokenTypeIds := (*[1 << 30]int32)(unsafe.Pointer(tokenTypeIds))[:numTokenTypeIds:numTokenTypeIds] | ||
sliceAttentionMask := (*[1 << 30]int32)(unsafe.Pointer(attentionMask))[:numAttentionMask:numAttentionMask] | ||
sliceOffsets := (*[1 << 30]int32)(unsafe.Pointer(offsets))[:numOffsets:numOffsets] | ||
// release | ||
defer C.free(unsafe.Pointer(inputIds)) | ||
defer C.free(unsafe.Pointer(tokenTypeIds)) | ||
defer C.free(unsafe.Pointer(attentionMask)) | ||
defer C.free(unsafe.Pointer(offsets)) | ||
return sliceInputIds, sliceTokenTypeIds, sliceAttentionMask, sliceOffsets | ||
} | ||
|
||
func (t *EasyTokenizer) WordPieceTokenize(text string) ([]string, []int32) { | ||
// text | ||
cText := C.CString(text) | ||
defer C.free(unsafe.Pointer(cText)) | ||
// init variable | ||
var tokens **C.char | ||
var offsets *C.int | ||
var numTokens, numOffsets C.int | ||
// call function | ||
C.wordPieceTokenize(t.easyTokenizer, cText, &tokens, &numTokens, &offsets, &numOffsets) | ||
defer C.free(unsafe.Pointer(tokens)) | ||
defer C.free(unsafe.Pointer(offsets)) | ||
// to Golang Slice | ||
sliceTokens := (*[1 << 30]*C.char)(unsafe.Pointer(tokens))[:numTokens:numTokens] | ||
sliceOffsets := (*[1 << 30]int32)(unsafe.Pointer(offsets))[:numOffsets:numOffsets] | ||
// parse string token | ||
goTokens := make([]string, numTokens) | ||
for i := 0; i < int(numTokens); i++ { | ||
goTokens[i] = C.GoString(sliceTokens[i]) | ||
} | ||
sliceTokens = nil | ||
return goTokens, sliceOffsets | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
// | ||
// Created by sunhailin-Leo on 2023/3/6. | ||
// | ||
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>

#include "easytokenizer_wrapper.h"
#include "tokenizer/tokenizer.h"
|
||
// Create a heap-allocated tokenizer::Tokenizer and return it to the caller
// as an opaque handle. Ownership transfers to the caller.
EasyTokenizer initTokenizer(char* vocab_path, bool do_lower_case, bool codepoint_level) {
  return static_cast<void*>(
      new tokenizer::Tokenizer(vocab_path, do_lower_case, codepoint_level));
}
|
||
// Encode `text` into token ids and write them into the caller-provided
// `output_data` buffer. The caller must size the buffer for the encoded
// length (the Go binding passes max_length with truncation enabled).
void encode(EasyTokenizer tokenizer, char* text, bool add_cls_sep, bool truncation, int max_length, int* output_data) {
  auto data = ((tokenizer::Tokenizer *) tokenizer)->encode(text, add_cls_sep, truncation, max_length);

  // Copy element-wise with a cast (the vector's element type is a tokenizer
  // typedef, not necessarily int). The previous version round-tripped the
  // data through a needless heap-allocated std::vector<int>.
  const int size = static_cast<int>(data.size());
  for (int i = 0; i < size; i++) {
    output_data[i] = static_cast<int>(data[i]);
  }
}
|
||
void encodeWithIds( | ||
EasyTokenizer tokenizer, | ||
char* text, | ||
int** input_ids, | ||
int* num_input_ids, | ||
int** token_type_ids, | ||
int* num_token_type_ids, | ||
int** attention_mask, | ||
int* num_attention_mask, | ||
int** offsets, | ||
int* num_offsets, | ||
bool add_cls_sep, | ||
bool padding, | ||
bool truncation, | ||
int max_length) { | ||
std::vector<tokenizer::SizeT> input_ids_vector; | ||
std::vector<tokenizer::SizeT> token_type_ids_vector; | ||
std::vector<tokenizer::SizeT> attention_mask_vector; | ||
std::vector<tokenizer::SizeT> offsets_vector; | ||
|
||
((tokenizer::Tokenizer *) tokenizer)->encodeWithIds(text, input_ids_vector, token_type_ids_vector, attention_mask_vector, offsets_vector, add_cls_sep, padding, truncation, max_length); | ||
|
||
// input_ids_vector | ||
*num_input_ids = input_ids_vector.size(); | ||
*input_ids = (int*) malloc(sizeof(int) * (*num_input_ids)); | ||
memcpy(*input_ids, input_ids_vector.data(), sizeof(int) * (*num_input_ids)); | ||
|
||
// token_type_ids_vector | ||
*num_token_type_ids = token_type_ids_vector.size(); | ||
*token_type_ids = (int*) malloc(sizeof(int) * (*num_token_type_ids)); | ||
memcpy(*token_type_ids, token_type_ids_vector.data(), sizeof(int) * (*num_token_type_ids)); | ||
|
||
// attention_mask_vector | ||
*num_attention_mask = attention_mask_vector.size(); | ||
*attention_mask = (int*) malloc(sizeof(int) * (*num_attention_mask)); | ||
memcpy(*attention_mask, attention_mask_vector.data(), sizeof(int) * (*num_attention_mask)); | ||
|
||
// offsets_vector | ||
*num_offsets = offsets_vector.size(); | ||
*offsets = (int*) malloc(sizeof(int) * (*num_offsets)); | ||
memcpy(*offsets, offsets_vector.data(), sizeof(int) * (*num_offsets)); | ||
} | ||
|
||
// WordPiece-tokenize `text`. On return, *tokens is a malloc'd array of
// *num_tokens individually malloc'd NUL-terminated strings, and *offsets is
// a malloc'd array of *num_offsets ints. The caller owns every buffer: each
// token string, the outer token array, and the offsets array.
void wordPieceTokenize(EasyTokenizer tokenizer, char* text, char*** tokens, int* num_tokens, int** offsets, int* num_offsets) {
  std::vector<std::string> token_vector;
  std::vector<tokenizer::SizeT> offset_vector;
  ((tokenizer::Tokenizer *) tokenizer)->wordpiece_tokenize_with_offsets(text, token_vector, offset_vector);

  *num_tokens = static_cast<int>(token_vector.size());
  *tokens = (char**) malloc(sizeof(char*) * token_vector.size());
  for (int i = 0; i < *num_tokens; i++) {
    const std::string& tok = token_vector[i];
    (*tokens)[i] = (char*) malloc(tok.size() + 1);
    // memcpy of size()+1 also copies the terminating NUL.
    memcpy((*tokens)[i], tok.c_str(), tok.size() + 1);
  }

  // Copy offsets element-wise with a cast: tokenizer::SizeT may be wider
  // than int, in which case the previous raw memcpy was unsafe.
  *num_offsets = static_cast<int>(offset_vector.size());
  *offsets = (int*) malloc(sizeof(int) * offset_vector.size());
  for (int i = 0; i < *num_offsets; i++) {
    (*offsets)[i] = static_cast<int>(offset_vector[i]);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
//
// Created by sunhailin-Leo on 2023/3/6.
//
// C API wrapper around the C++ easytokenizer library, consumable from CGO.
//
// An include guard was added: without one, a second inclusion repeats the
// EasyTokenizer typedef, which is an error in pre-C11 C.
#ifndef EASYTOKENIZER_WRAPPER_H
#define EASYTOKENIZER_WRAPPER_H

#include <stdlib.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Opaque handle to a heap-allocated native tokenizer instance. */
typedef void *EasyTokenizer;

/* Create a tokenizer from the vocabulary file at vocab_path. */
EasyTokenizer initTokenizer(char* vocab_path, bool do_lower_case, bool codepoint_level);

/* Encode text into ids, writing them into the caller-allocated output_data
 * buffer (the Go binding sizes it to max_length with truncation enabled). */
void encode(EasyTokenizer tokenizer, char* text, bool add_cls_sep, bool truncation, int max_length, int* output_data);

/* Full encoding: the four output arrays are malloc'd inside the call and
 * owned by the caller, which must free() each of them. */
void encodeWithIds(
    EasyTokenizer tokenizer,
    char* text,
    int** input_ids,
    int* num_input_ids,
    int** token_type_ids,
    int* num_token_type_ids,
    int** attention_mask,
    int* num_attention_mask,
    int** offsets,
    int* num_offsets,
    bool add_cls_sep,
    bool padding,
    bool truncation,
    int max_length);

/* WordPiece tokenization: the token array, each token string, and the
 * offsets array are malloc'd inside the call and owned by the caller. */
void wordPieceTokenize(EasyTokenizer tokenizer, char* text, char*** tokens, int* num_tokens, int** offsets, int* num_offsets);

#ifdef __cplusplus
}
#endif

#endif /* EASYTOKENIZER_WRAPPER_H */
Oops, something went wrong.