Skip to content

Commit 322caaf

Browse files
authored
feat: support file params in analyzer and set jieba dict file (#45206)
relate: #43687 Support user-provided files via file params in analyzer params. A file param may reference either a local file or a remote file resource. Support file params for the jieba extra dict. Signed-off-by: aoiasd <[email protected]>
1 parent 5e6fdf3 commit 322caaf

File tree

8 files changed

+156
-23
lines changed

8 files changed

+156
-23
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ proxy/suvlim/*
4747
proxy-go/proxy-go
4848

4949
# Compiled source
50+
target/
5051
bin/
5152
lib/
5253
*.a

internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@ mod analyzer;
22
mod build_in_analyzer;
33
mod dict;
44
mod filter;
5-
mod runtime_option;
5+
mod options;
66

77
pub mod tokenizers;
88
pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
9-
pub use self::runtime_option::set_options;
9+
pub use self::options::set_options;
1010

1111
pub(crate) use self::build_in_analyzer::standard_analyzer;
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
mod runtime_option;
2+
mod util;
3+
4+
pub use self::runtime_option::{get_lindera_download_url, get_options, set_options};
5+
6+
pub use self::util::get_resource_path;
7+
8+
pub use self::runtime_option::DEFAULT_DICT_PATH_KEY;

internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/runtime_option.rs renamed to internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/runtime_option.rs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use crate::error::{Result, TantivyBindingError};
22
use once_cell::sync::Lazy;
33
use serde_json as json;
44
use std::collections::HashMap;
5+
use std::path::PathBuf;
56
use std::sync::{Arc, RwLock};
67

78
static GLOBAL_OPTIONS: Lazy<Arc<RuntimeOption>> = Lazy::new(|| Arc::new(RuntimeOption::new()));
@@ -26,8 +27,8 @@ pub fn get_lindera_download_url(kind: &str) -> Option<Vec<String>> {
2627
GLOBAL_OPTIONS.get_lindera_download_urls(kind)
2728
}
2829

29-
pub fn get_resource_id(name: &str) -> Option<i64> {
30-
GLOBAL_OPTIONS.get_resource_id(name)
30+
pub fn get_resource_file_path(resource_name: &str, file_name: &str) -> Result<PathBuf> {
31+
GLOBAL_OPTIONS.get_resource_file_path(resource_name, file_name)
3132
}
3233

3334
// analyzer options
@@ -57,9 +58,28 @@ impl RuntimeOption {
5758
r.lindera_download_urls.get(kind).map(|v| v.clone())
5859
}
5960

60-
fn get_resource_id(&self, name: &str) -> Option<i64> {
61+
fn get_resource_file_path(&self, resource_name: &str, file_name: &str) -> Result<PathBuf> {
6162
let r = self.inner.read().unwrap();
62-
r.resource_map.get(name).cloned()
63+
let resource_id =
64+
r.resource_map
65+
.get(resource_name)
66+
.ok_or(TantivyBindingError::InternalError(format!(
67+
"file resource: {} not found in local resource list",
68+
resource_name
69+
)))?;
70+
let base = r
71+
.params
72+
.get(RESOURCE_PATH_KEY)
73+
.ok_or(TantivyBindingError::InternalError(
74+
"local_resource_path config not init success".to_string(),
75+
))?
76+
.as_str()
77+
.ok_or("local_resource_path must set as string")?;
78+
79+
return Ok(PathBuf::new()
80+
.join(base)
81+
.join(resource_id.to_string())
82+
.join(file_name));
6383
}
6484
}
6585

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
use serde_json as json;
2+
use std::path::{Path, PathBuf};
3+
4+
use super::runtime_option::get_resource_file_path;
5+
use crate::error::{Result, TantivyBindingError};
6+
7+
pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf> {
8+
if !v.is_object() {
9+
return Err(TantivyBindingError::InvalidArgument(format!(
10+
"file config of {} must be object",
11+
resource_key
12+
)));
13+
}
14+
15+
let params = v.as_object().unwrap();
16+
17+
let file_type = params.get("type").ok_or_else(|| {
18+
TantivyBindingError::InvalidArgument(format!("file type of {} must be set", resource_key))
19+
})?;
20+
21+
if !file_type.is_string() {
22+
return Err(TantivyBindingError::InvalidArgument(format!(
23+
"file type of {} must be string",
24+
resource_key
25+
)));
26+
}
27+
28+
match file_type.as_str().unwrap() {
29+
"local" => {
30+
let path = params.get("path").ok_or_else(|| {
31+
TantivyBindingError::InvalidArgument(format!(
32+
"file path of local file `{}` must be set",
33+
resource_key
34+
))
35+
})?;
36+
37+
if !path.is_string() {
38+
return Err(TantivyBindingError::InvalidArgument(format!(
39+
"file path of local file `{}` must be string",
40+
resource_key
41+
)));
42+
}
43+
44+
let path_str = path.as_str().unwrap();
45+
Ok(Path::new(path_str).to_path_buf())
46+
}
47+
"remote" => {
48+
let resource_name = params
49+
.get("resource_name")
50+
.ok_or_else(|| {
51+
TantivyBindingError::InvalidArgument(format!(
52+
"resource name of remote file `{}` must be set",
53+
resource_key
54+
))
55+
})?
56+
.as_str()
57+
.ok_or(TantivyBindingError::InvalidArgument(format!(
58+
"remote file resource name of remote file `{}` must be string",
59+
resource_key
60+
)))?;
61+
62+
let file_name = params
63+
.get("file_name")
64+
.ok_or_else(|| {
65+
TantivyBindingError::InvalidArgument(format!(
66+
"file name of remote file `{}` must be set",
67+
resource_key
68+
))
69+
})?
70+
.as_str()
71+
.ok_or(TantivyBindingError::InvalidArgument(format!(
72+
"remote file resource name of {} must be string",
73+
resource_key
74+
)))?;
75+
76+
self::get_resource_file_path(resource_name, file_name)
77+
}
78+
other => Err(TantivyBindingError::InvalidArgument(format!(
79+
"unsupported file type {} of {}",
80+
other, resource_key
81+
))),
82+
}
83+
}

internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ use core::{option::Option::Some, result::Result::Ok};
22
use jieba_rs;
33
use lazy_static::lazy_static;
44
use serde_json as json;
5-
use std::borrow::Cow;
5+
use std::fs;
66
use std::io::BufReader;
7+
use std::{borrow::Cow, path::PathBuf};
78
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
89

10+
use crate::analyzer::options;
911
use crate::error::{Result, TantivyBindingError};
1012

1113
lazy_static! {
@@ -54,16 +56,19 @@ impl TokenStream for JiebaTokenStream {
5456

5557
fn get_jieba_dict(
5658
params: &json::Map<String, json::Value>,
57-
) -> Result<(Vec<String>, Option<String>)> {
59+
) -> Result<(Vec<String>, Option<String>, Option<PathBuf>)> {
60+
let mut words = Vec::<String>::new();
61+
let mut user_dict = None;
62+
// use default dict as default system dict
63+
let mut system_dict = Some("_default_".to_string());
5864
match params.get("dict") {
5965
Some(value) => {
66+
system_dict = None;
6067
if !value.is_array() {
6168
return Err(TantivyBindingError::InvalidArgument(format!(
6269
"jieba tokenizer dict must be array"
6370
)));
6471
}
65-
let mut dict = Vec::<String>::new();
66-
let mut system_dict = None;
6772

6873
for word in value.as_array().unwrap() {
6974
if !word.is_string() {
@@ -82,18 +87,27 @@ fn get_jieba_dict(
8287
if text == "_default_" || text == "_extend_default_" {
8388
if system_dict.is_some() {
8489
return Err(TantivyBindingError::InvalidArgument(format!(
85-
"jieba tokenizer dict can only set one default dict"
90+
"jieba tokenizer dict can only set one system dict"
8691
)));
8792
}
8893
system_dict = Some(text)
8994
} else {
90-
dict.push(text);
95+
words.push(text);
9196
}
9297
}
93-
Ok((dict, system_dict))
9498
}
95-
_ => Ok((vec![], Some("_default_".to_string()))),
96-
}
99+
_ => {}
100+
};
101+
102+
match params.get("extra_dict_file") {
103+
Some(v) => {
104+
let path = options::get_resource_path(v, "jieba extra dict file")?;
105+
user_dict = Some(path)
106+
}
107+
_ => {}
108+
};
109+
110+
Ok((words, system_dict, user_dict))
97111
}
98112

99113
fn get_jieba_mode(params: &json::Map<String, json::Value>) -> Result<JiebaMode> {
@@ -143,7 +157,7 @@ impl<'a> JiebaTokenizer<'a> {
143157
}
144158

145159
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
146-
let (dict, system_dict) = get_jieba_dict(params)?;
160+
let (words, system_dict, user_dict) = get_jieba_dict(params)?;
147161

148162
let mut tokenizer =
149163
system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() {
@@ -163,10 +177,21 @@ impl<'a> JiebaTokenizer<'a> {
163177
))),
164178
})?;
165179

166-
for word in dict {
180+
for word in words {
167181
tokenizer.add_word(word.as_str(), None, None);
168182
}
169183

184+
if user_dict.is_some() {
185+
let file = fs::File::open(user_dict.unwrap())?;
186+
let mut reader = BufReader::new(file);
187+
tokenizer.load_dict(&mut reader).map_err(|e| {
188+
TantivyBindingError::InvalidArgument(format!(
189+
"jieba tokenizer load dict file failed with error: {:?}",
190+
e
191+
))
192+
})?;
193+
}
194+
170195
let mode = get_jieba_mode(params)?;
171196
let hmm = get_jieba_hmm(params)?;
172197

internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ use lindera::mode::Mode;
66
use lindera::segmenter::Segmenter;
77
use lindera::token::Token as LToken;
88
use lindera::tokenizer::Tokenizer as LTokenizer;
9-
use log::warn;
109
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
1110

1211
use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
@@ -17,9 +16,7 @@ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
1716
use lindera::token_filter::BoxTokenFilter as LTokenFilter;
1817

1918
use crate::analyzer::dict::lindera::load_dictionary_from_kind;
20-
use crate::analyzer::runtime_option::{
21-
get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY,
22-
};
19+
use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
2320
use crate::error::{Result, TantivyBindingError};
2421
use serde_json as json;
2522

internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use core::slice;
21
use std::ffi::{c_char, c_void, CStr};
32

43
use crate::{
@@ -18,7 +17,7 @@ macro_rules! convert_to_rust_slice {
1817
match $arr {
1918
// there is a UB in slice::from_raw_parts if the pointer is null
2019
x if x.is_null() => &[],
21-
_ => slice::from_raw_parts($arr, $len),
20+
_ => ::core::slice::from_raw_parts($arr, $len),
2221
}
2322
};
2423
}

0 commit comments

Comments
 (0)