version 0.1.0 - initialize project
sunhailin-Leo committed Mar 13, 2023
1 parent 7d1677b commit d2126b3
Showing 17 changed files with 41,590 additions and 0 deletions.
52 changes: 52 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,52 @@
# Copyright (c) 2022-present, Zejun Wang ([email protected])
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.10)
cmake_policy(SET CMP0079 NEW)

project(easytokenizer VERSION 0.2.0 LANGUAGES CXX C)

set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR x86_64)

#set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/../)
#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -fPIC -funroll-loops")

# OMP
option(WITH_OMP "Compile with OpenMP" OFF)
if (WITH_OMP)
  find_package(OpenMP REQUIRED)
  add_definitions(-DWITH_OMP)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()

message("CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}")

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/tokenizer)

# add_library (static)
add_library(tokenizer_static_lib
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/tokenizer.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/utf8proc.c)
target_link_libraries(tokenizer_static_lib pthread)
set_target_properties(tokenizer_static_lib PROPERTIES OUTPUT_NAME tokenizer)

# add_library (SHARED)
add_library(tokenizer_shared_lib SHARED
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/tokenizer.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer/utf8proc.c)
target_link_libraries(tokenizer_shared_lib pthread)
set_target_properties(tokenizer_shared_lib PROPERTIES OUTPUT_NAME tokenizer)

# CGO wrapper library
add_library(tokenizer SHARED easytokenizer_wrapper.cc)
target_link_libraries(tokenizer PUBLIC tokenizer_static_lib)
target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
# GNUInstallDirs defines CMAKE_INSTALL_LIBDIR, which install() below relies on
include(GNUInstallDirs)
install(TARGETS tokenizer DESTINATION ${CMAKE_INSTALL_LIBDIR})
22 changes: 22 additions & 0 deletions README.md
@@ -1,2 +1,24 @@
# easytokenizer-to-go
A Golang binding for https://github.com/zejunwang1/easytokenizer

**This project targets easytokenizer 0.2.0 and renames some wrapper functions, because C++ function overloading cannot be expressed through the C interface that CGO requires.**

### Version

* version 0.1.0
  * Implemented the CGO APIs `initTokenizer`, `freeTokenizer`, `encode`, `encodeWithIds`, and `wordPieceTokenize`.
  * Implemented the Golang APIs `NewTokenizer`, `Close`, `Encode`, `EncodeWithIds`, and `WordPieceTokenize`.

### Build Library

* Linux/macOS
* `sh build.sh`

### Usage

* When building a Golang program, run `export CGO_CXXFLAGS=-std=c++11` before `go build` / `go run` / `go test`, for example:
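
  A minimal sketch (the build target is a placeholder for your own package):

  ```sh
  # the flag must be exported before any go command that compiles the cgo code
  export CGO_CXXFLAGS=-std=c++11
  go build ./...
  ```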


### Example

* See the `example` folder; a minimal usage sketch is shown below.
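
A minimal end-to-end sketch (the import path and the `vocab.txt` vocabulary file are placeholder assumptions; adapt them to your module layout):

```go
package main

import (
	"fmt"

	// placeholder import path for this binding
	tokenizer "github.com/sunhailin-Leo/easytokenizer-to-go"
)

func main() {
	// "vocab.txt" stands in for a BERT-style vocabulary file
	t := tokenizer.NewTokenizer("vocab.txt", true)
	defer t.Close()

	// token ids in a fixed-length, zero-padded buffer
	ids := t.Encode("easytokenizer is fast", 16)
	fmt.Println(ids)

	// wordpiece tokens with their offsets
	tokens, offsets := t.WordPieceTokenize("easytokenizer is fast")
	fmt.Println(tokens, offsets)
}
```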
5 changes: 5 additions & 0 deletions build.sh
@@ -0,0 +1,5 @@
#!/bin/bash
set -e
mkdir -p build && cd build
cmake ..
make && make install

96 changes: 96 additions & 0 deletions easytokenizer.go
@@ -0,0 +1,96 @@
package tokenizer

// #cgo CXXFLAGS: -std=c++11
// #cgo LDFLAGS: -L${SRCDIR} -ltokenizer -lm
// #include "easytokenizer_wrapper.h"
import "C"
import (
	"unsafe"
)

type EasyTokenizer struct {
	easyTokenizer  C.EasyTokenizer
	vocabPath      string
	doLowerCase    bool
	codePointLevel bool
}

// NewTokenizer creates and initializes an EasyTokenizer.
func NewTokenizer(vocabPath string, doLowerCase bool) *EasyTokenizer {
	var tokenizer EasyTokenizer
	tokenizer.vocabPath = vocabPath
	tokenizer.doLowerCase = doLowerCase
	// code-point level is always enabled
	tokenizer.codePointLevel = true
	// pass the vocab path to C and free our copy once the tokenizer is built
	cVocabPath := C.CString(vocabPath)
	defer C.free(unsafe.Pointer(cVocabPath))
	tokenizer.easyTokenizer = C.initTokenizer(cVocabPath, C.bool(tokenizer.doLowerCase), C.bool(tokenizer.codePointLevel))
	return &tokenizer
}

// Close releases the underlying C++ tokenizer.
func (t *EasyTokenizer) Close() {
	// freeTokenizer deletes the C++ object; a plain C.free would skip its destructor
	C.freeTokenizer(t.easyTokenizer)
	t.easyTokenizer = nil
}

// Encode converts text into a fixed-length buffer of token ids.
func (t *EasyTokenizer) Encode(text string, maxSeqLength int) []int32 {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))
	// allocate the output buffer; entries beyond the encoded length stay 0
	outputData := make([]int32, maxSeqLength)
	// call the C wrapper with add_cls_sep=true and truncation=true
	C.encode(t.easyTokenizer, cText, C.bool(true), C.bool(true), C.int(maxSeqLength), (*C.int)(unsafe.Pointer(&outputData[0])))
	return outputData
}

// EncodeWithIds returns input ids, token type ids, attention mask and offsets.
func (t *EasyTokenizer) EncodeWithIds(text string, maxSeqLength int) ([]int32, []int32, []int32, []int32) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))
	// output arrays allocated by the C wrapper, plus their lengths
	var inputIds, tokenTypeIds, attentionMask, offsets *C.int
	var numInputIds, numTokenTypeIds, numAttentionMask, numOffsets C.int
	// call the C wrapper with add_cls_sep=true, padding=true, truncation=true
	C.encodeWithIds(t.easyTokenizer, cText,
		&inputIds, &numInputIds,
		&tokenTypeIds, &numTokenTypeIds,
		&attentionMask, &numAttentionMask,
		&offsets, &numOffsets,
		C.bool(true), C.bool(true), C.bool(true), C.int(maxSeqLength))
	// copy each C array into a Go-allocated slice before freeing it, so the
	// returned slices never alias freed C memory
	sliceInputIds := make([]int32, int(numInputIds))
	copy(sliceInputIds, (*[1 << 30]int32)(unsafe.Pointer(inputIds))[:numInputIds:numInputIds])
	sliceTokenTypeIds := make([]int32, int(numTokenTypeIds))
	copy(sliceTokenTypeIds, (*[1 << 30]int32)(unsafe.Pointer(tokenTypeIds))[:numTokenTypeIds:numTokenTypeIds])
	sliceAttentionMask := make([]int32, int(numAttentionMask))
	copy(sliceAttentionMask, (*[1 << 30]int32)(unsafe.Pointer(attentionMask))[:numAttentionMask:numAttentionMask])
	sliceOffsets := make([]int32, int(numOffsets))
	copy(sliceOffsets, (*[1 << 30]int32)(unsafe.Pointer(offsets))[:numOffsets:numOffsets])
	// release the C buffers
	C.free(unsafe.Pointer(inputIds))
	C.free(unsafe.Pointer(tokenTypeIds))
	C.free(unsafe.Pointer(attentionMask))
	C.free(unsafe.Pointer(offsets))
	return sliceInputIds, sliceTokenTypeIds, sliceAttentionMask, sliceOffsets
}

// WordPieceTokenize returns the wordpiece tokens of text and their offsets.
func (t *EasyTokenizer) WordPieceTokenize(text string) ([]string, []int32) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))
	// output arrays allocated by the C wrapper, plus their lengths
	var tokens **C.char
	var offsets *C.int
	var numTokens, numOffsets C.int
	C.wordPieceTokenize(t.easyTokenizer, cText, &tokens, &numTokens, &offsets, &numOffsets)
	// view the C arrays as Go slices (classic cgo big-array cast)
	sliceTokens := (*[1 << 30]*C.char)(unsafe.Pointer(tokens))[:numTokens:numTokens]
	sliceOffsets := (*[1 << 30]int32)(unsafe.Pointer(offsets))[:numOffsets:numOffsets]
	// copy each token into a Go string, then free the C string the wrapper malloc'd
	goTokens := make([]string, int(numTokens))
	for i := 0; i < int(numTokens); i++ {
		goTokens[i] = C.GoString(sliceTokens[i])
		C.free(unsafe.Pointer(sliceTokens[i]))
	}
	// copy the offsets into Go memory before freeing the C buffers, so the
	// returned slice never aliases freed C memory
	goOffsets := make([]int32, int(numOffsets))
	copy(goOffsets, sliceOffsets)
	C.free(unsafe.Pointer(tokens))
	C.free(unsafe.Pointer(offsets))
	return goTokens, goOffsets
}
84 changes: 84 additions & 0 deletions easytokenizer_wrapper.cc
@@ -0,0 +1,84 @@
//
// Created by sunhailin-Leo on 2023/3/6.
//
#include <cstring>  // strcpy
#include <string>
#include <vector>

#include "easytokenizer_wrapper.h"
#include "tokenizer/tokenizer.h"

EasyTokenizer initTokenizer(char* vocab_path, bool do_lower_case, bool codepoint_level) {
  tokenizer::Tokenizer *tk = new tokenizer::Tokenizer(vocab_path, do_lower_case, codepoint_level);
  return (void *) tk;
}

void freeTokenizer(EasyTokenizer tokenizer) {
  // delete the object created with new in initTokenizer; releasing it with
  // free() from the Go side would skip the destructor
  delete (tokenizer::Tokenizer *) tokenizer;
}

void encode(EasyTokenizer tokenizer, char* text, bool add_cls_sep, bool truncation, int max_length, int* output_data) {
  auto data = ((tokenizer::Tokenizer *) tokenizer)->encode(text, add_cls_sep, truncation, max_length);
  // copy the encoded ids directly into the caller-provided buffer
  size_t size = data.size();
  for (size_t i = 0; i < size; i++)
    output_data[i] = (int) data[i];
}

void encodeWithIds(
    EasyTokenizer tokenizer,
    char* text,
    int** input_ids,
    int* num_input_ids,
    int** token_type_ids,
    int* num_token_type_ids,
    int** attention_mask,
    int* num_attention_mask,
    int** offsets,
    int* num_offsets,
    bool add_cls_sep,
    bool padding,
    bool truncation,
    int max_length) {
  std::vector<tokenizer::SizeT> input_ids_vector;
  std::vector<tokenizer::SizeT> token_type_ids_vector;
  std::vector<tokenizer::SizeT> attention_mask_vector;
  std::vector<tokenizer::SizeT> offsets_vector;

  ((tokenizer::Tokenizer *) tokenizer)->encodeWithIds(text, input_ids_vector, token_type_ids_vector, attention_mask_vector, offsets_vector, add_cls_sep, padding, truncation, max_length);

  // copy a vector of tokenizer::SizeT into a malloc'd int buffer, element by
  // element: memcpy would be incorrect if SizeT is wider than int
  auto copy_out = [](const std::vector<tokenizer::SizeT>& vec, int** out, int* num) {
    *num = (int) vec.size();
    *out = (int*) malloc(sizeof(int) * (*num));
    for (int i = 0; i < *num; i++)
      (*out)[i] = (int) vec[i];
  };

  copy_out(input_ids_vector, input_ids, num_input_ids);
  copy_out(token_type_ids_vector, token_type_ids, num_token_type_ids);
  copy_out(attention_mask_vector, attention_mask, num_attention_mask);
  copy_out(offsets_vector, offsets, num_offsets);
}

void wordPieceTokenize(EasyTokenizer tokenizer, char* text, char*** tokens, int* num_tokens, int** offsets, int* num_offsets) {
  std::vector<std::string> token_vector;
  std::vector<tokenizer::SizeT> offset_vector;
  ((tokenizer::Tokenizer *) tokenizer)->wordpiece_tokenize_with_offsets(text, token_vector, offset_vector);

  // each token string is malloc'd individually; the caller frees them
  *num_tokens = (int) token_vector.size();
  *tokens = (char**) malloc(sizeof(char*) * (*num_tokens));
  for (int i = 0; i < *num_tokens; i++) {
    (*tokens)[i] = (char*) malloc(token_vector[i].size() + 1);
    strcpy((*tokens)[i], token_vector[i].c_str());
  }

  // element-wise copy: memcpy would be incorrect if tokenizer::SizeT is wider than int
  *num_offsets = (int) offset_vector.size();
  *offsets = (int*) malloc(sizeof(int) * (*num_offsets));
  for (int i = 0; i < *num_offsets; i++)
    (*offsets)[i] = (int) offset_vector[i];
}
35 changes: 35 additions & 0 deletions easytokenizer_wrapper.h
@@ -0,0 +1,35 @@
//
// Created by sunhailin-Leo on 2023/3/6.
//
#ifndef EASYTOKENIZER_WRAPPER_H
#define EASYTOKENIZER_WRAPPER_H

#include <stdlib.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef void *EasyTokenizer;

EasyTokenizer initTokenizer(char* vocab_path, bool do_lower_case, bool codepoint_level);

/* releases a tokenizer created by initTokenizer */
void freeTokenizer(EasyTokenizer tokenizer);

void encode(EasyTokenizer tokenizer, char* text, bool add_cls_sep, bool truncation, int max_length, int* output_data);

void encodeWithIds(
    EasyTokenizer tokenizer,
    char* text,
    int** input_ids,
    int* num_input_ids,
    int** token_type_ids,
    int* num_token_type_ids,
    int** attention_mask,
    int* num_attention_mask,
    int** offsets,
    int* num_offsets,
    bool add_cls_sep,
    bool padding,
    bool truncation,
    int max_length);

void wordPieceTokenize(EasyTokenizer tokenizer, char* text, char*** tokens, int* num_tokens, int** offsets, int* num_offsets);

#ifdef __cplusplus
}
#endif

#endif /* EASYTOKENIZER_WRAPPER_H */
