Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions .github/workflows/python-upload-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,19 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest, ubuntu-24.04-arm, windows-latest, macOS-latest]
target: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
target: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
include:
- os: "ubuntu-latest"
target: "sdist"
python-version: "3.13"
- os: "ubuntu-latest"
target: "sdist"
python-version: "3.14"
exclude:
- os: "windows-latest"
target: "3.13t"
- os: "windows-latest"
target: "3.14t"

runs-on: ${{ matrix.os }}
steps:
Expand All @@ -136,17 +141,17 @@ jobs:
# this must be after sudachipy install
run: python -m pip install sudachidict_core
- name: Install dependencies (test pretokenizer)
# tokenizers for py3.13t is not provided yet
if: ${{ matrix.target != '3.13t' }}
# tokenizers for py3.13t, py3.14, py3.14t are not provided yet
if: ${{ matrix.target != '3.13t' && matrix.target != '3.14' && matrix.target != '3.14t' }}
run: python -m pip install tokenizers

- name: Run test
if: ${{ matrix.target != '3.13t' }}
if: ${{ matrix.target != '3.13t' && matrix.target != '3.14' && matrix.target != '3.14t' }}
working-directory: ./python
run: python -m unittest
- name: Run test (skip pretokenizer test)
# tokenizers for py3.13t is not provided yet
if: ${{ matrix.target == '3.13t' }}
# tokenizers for py3.13t, py3.14, py3.14t are not provided yet
if: ${{ matrix.target == '3.13t' || matrix.target == '3.14' || matrix.target == '3.14t' }}
working-directory: ./python
run: ls tests/test_*.py | grep -v pretokenizer | xargs -I{} python -m unittest {}
- name: Check that binary works (C mode)
Expand Down
26 changes: 12 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ name = "sudachipy"
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.23", features = ["extension-module"] }
pyo3 = { version = "0.27", features = ["extension-module"] }
scopeguard = "1" # Apache 2.0/MIT
thread_local = "1.1" # Apache 2.0/MIT

Expand Down
2 changes: 1 addition & 1 deletion python/py_src/sudachipy/sudachipy.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class Morpheme:
Returns sub-morphemes in the provided split mode.

:param mode: mode of new split.
:param out: write results to this MorhpemeList instead of creating new one.
:param out: write results to this MorphemeList instead of creating new one.
See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
more information on output parameters.
Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
Expand Down
2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
requires = ["setuptools", "wheel", "setuptools-rust"]

[tool.cibuildwheel]
build = "cp39-* cp310-* cp311-* cp312-* cp313-* cp313t-*"
build = "cp39-* cp310-* cp311-* cp312-* cp313-* cp313t-* cp314-* cp314t-*"
skip = "*t-win* *-win32 *-musllinux_*"
enable = ["cpython-freethreading"]

Expand Down
10 changes: 4 additions & 6 deletions python/src/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ fn create_file(p: &Path) -> std::io::Result<File> {
///
/// :param matrix: Path to the matrix file.
/// :param lex: List of paths to lexicon files.
/// :param output: Path to output built dictionray.
/// :param output: Path to output built dictionary.
/// :param description: A description text to embed in the dictionary.
/// :return: A build report, list of (part, size, time).
///
Expand Down Expand Up @@ -107,7 +107,7 @@ fn build_system_dic<'py>(
///
/// :param system: Path to the system dictionary.
/// :param lex: List of paths to lexicon files.
/// :param output: Path to output built dictionray.
/// :param output: Path to output built dictionary.
/// :param description: A description text to embed in the dictionary.
/// :return: A build report, list of (part, size, time).
///
Expand Down Expand Up @@ -168,7 +168,7 @@ fn resolve_as_pypathstr<'py>(
data: &Bound<'py, PyAny>,
) -> PyResult<Option<Bound<'py, PyString>>> {
let binding = py.import("pathlib")?.getattr("Path")?;
let path = binding.downcast::<PyType>()?;
let path = binding.cast::<PyType>()?;
if data.is_instance(path)? {
Ok(Some(data.call_method0("resolve")?.str()?))
} else if data.is_instance_of::<PyString>() {
Expand All @@ -186,9 +186,7 @@ fn as_data_source<'py>(
Some(pystr) => Ok(DataSource::File(Path::new(pystr.to_str()?))),
None => {
if original_obj.is_instance_of::<PyBytes>() {
Ok(DataSource::Data(
original_obj.downcast::<PyBytes>()?.as_bytes(),
))
Ok(DataSource::Data(original_obj.cast::<PyBytes>()?.as_bytes()))
} else {
errors::wrap(Err(format!(
"data source should be only Path, bytes or str, was {}: {}",
Expand Down
10 changes: 5 additions & 5 deletions python/src/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pub(crate) struct PyDicData {
pub(crate) pos: Vec<Py<PyTuple>>,
/// Compute default string representation for a morpheme using vtable dispatch.
/// None by default (if outputting surface as it is)
/// This is default per-dictionary value, can be overriden when creating tokenizers and pre-tokenizers
/// This is the default per-dictionary value; it can be overridden when creating tokenizers and pre-tokenizers
pub(crate) projection: PyProjector,
}

Expand Down Expand Up @@ -430,7 +430,7 @@ impl PyDictionary {
///
/// :type pos_id: int
#[pyo3(text_signature = "(self, /, pos_id: int) -> tuple[str, str, str, str, str, str] | None")]
fn pos_of<'py>(&'py self, py: Python<'py>, pos_id: usize) -> Option<&Bound<'py, PyTuple>> {
fn pos_of<'py>(&'py self, py: Python<'py>, pos_id: usize) -> Option<&'py Bound<'py, PyTuple>> {
let dic = self.dictionary.as_ref().unwrap();
dic.pos.get(pos_id).map(|x| x.bind(py))
}
Expand Down Expand Up @@ -516,21 +516,21 @@ fn read_config(config_opt: &Bound<PyAny>) -> PyResult<ConfigBuilder> {

pub(crate) fn read_default_config(py: Python) -> PyResult<ConfigBuilder> {
let path = py.import("sudachipy")?.getattr("_DEFAULT_SETTINGFILE")?;
let path = path.downcast::<PyString>()?.to_str()?;
let path = path.cast::<PyString>()?.to_str()?;
let path = PathBuf::from(path);
errors::wrap_ctx(ConfigBuilder::from_opt_file(Some(&path)), &path)
}

pub(crate) fn get_default_resource_dir(py: Python) -> PyResult<PathBuf> {
let path = py.import("sudachipy")?.getattr("_DEFAULT_RESOURCEDIR")?;
let path = path.downcast::<PyString>()?.to_str()?;
let path = path.cast::<PyString>()?.to_str()?;
Ok(PathBuf::from(path))
}

fn find_dict_path(py: Python, dict_type: &str) -> PyResult<PathBuf> {
let pyfunc = py.import("sudachipy")?.getattr("_find_dict_path")?;
let path = pyfunc.call1((dict_type,))?;
let path = path.downcast::<PyString>()?.to_str()?;
let path = path.cast::<PyString>()?.to_str()?;
Ok(PathBuf::from(path))
}

Expand Down
10 changes: 5 additions & 5 deletions python/src/morpheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ impl PyMorpheme {

/// Returns the dictionary form.
#[pyo3(text_signature = "(self, /) -> str")]
fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<PyString>> {
fn dictionary_form<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyString>> {
Ok(self
.morph(py)
.get_word_info()
Expand All @@ -345,7 +345,7 @@ impl PyMorpheme {

/// Returns the normalized form.
#[pyo3(text_signature = "(self, /) -> str")]
fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<PyString>> {
fn normalized_form<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyString>> {
Ok(self
.morph(py)
.get_word_info()
Expand All @@ -355,7 +355,7 @@ impl PyMorpheme {

/// Returns the reading form.
#[pyo3(text_signature = "(self, /) -> str")]
fn reading_form<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<PyString>> {
fn reading_form<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyString>> {
Ok(self
.morph(py)
.get_word_info()
Expand All @@ -366,7 +366,7 @@ impl PyMorpheme {
/// Returns sub-morphemes in the provided split mode.
///
/// :param mode: mode of new split.
/// :param out: write results to this MorhpemeList instead of creating new one.
/// :param out: write results to this MorphemeList instead of creating new one.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for
/// more information on output parameters.
/// Returned MorphemeList will be invalidated if this MorphemeList is used as an output parameter.
Expand Down Expand Up @@ -444,7 +444,7 @@ impl PyMorpheme {

/// Returns the list of synonym group ids.
#[pyo3(text_signature = "(self, /) -> List[int]")]
fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<PyList>> {
fn synonym_group_ids<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {
let mref = self.morph(py);
let ids = mref.get_word_info().synonym_group_ids();
PyList::new(py, ids)
Expand Down
6 changes: 3 additions & 3 deletions python/src/pos_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ impl PyPosMatcher {
fn create_from_fn(dic: &Arc<PyDicData>, func: &Bound<PyAny>) -> PyResult<Self> {
let mut data = Vec::new();
for (pos_id, pos) in dic.pos.iter().enumerate() {
if func.call1((pos,))?.downcast::<PyBool>()?.is_true() {
if func.call1((pos,))?.cast::<PyBool>()?.is_true() {
data.push(pos_id as u16);
}
}
Expand All @@ -67,7 +67,7 @@ impl PyPosMatcher {
let mut result = Vec::new();
for item in data {
let item = item?;
let item = item.downcast::<PyTuple>()?;
let item = item.cast::<PyTuple>()?;
Self::match_pos_elements(&mut result, dic.as_ref(), item)?;
}
Ok(Self {
Expand Down Expand Up @@ -232,7 +232,7 @@ impl PyPosIter {
slf
}

fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&Bound<'py, PyTuple>> {
fn __next__<'py>(&'py mut self, py: Python<'py>) -> Option<&'py Bound<'py, PyTuple>> {
let idx = self.index;
self.index += 1;
if idx >= self.data.len() {
Expand Down
8 changes: 4 additions & 4 deletions python/src/pretokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use std::sync::Arc;

use pyo3::intern;
use pyo3::prelude::*;
use pyo3::sync::GILOnceCell;
use pyo3::sync::PyOnceLock;
use pyo3::types::{PyList, PySlice, PyType};
use thread_local::ThreadLocal;

Expand Down Expand Up @@ -138,7 +138,7 @@ impl PyPretokenizer {
let pystr = string.str()?;
let input_data = pystr.to_str()?;
// tokenization itself should work without the GIL, since we have thread-local tokenizers here
py.allow_threads(|| self.tokenizer_cell().borrow_mut().tokenize(input_data))?;
py.detach(|| self.tokenizer_cell().borrow_mut().tokenize(input_data))?;
// then prepare results with GIL
self.tokenizer_cell().borrow_mut().collect_results(py)?;
let cell = self.tokenizer_cell().borrow();
Expand Down Expand Up @@ -191,10 +191,10 @@ fn make_result_for_projection<'py>(
) -> PyResult<Bound<'py, PyList>> {
let result = PyList::empty(py);
let nstring = {
static NORMALIZED_STRING: GILOnceCell<Py<PyType>> = GILOnceCell::new();
static NORMALIZED_STRING: PyOnceLock<Py<PyType>> = PyOnceLock::new();
NORMALIZED_STRING.get_or_try_init(py, || -> PyResult<Py<PyType>> {
let ns = py.import("tokenizers")?.getattr("NormalizedString")?;
let tpe = ns.downcast::<PyType>()?;
let tpe = ns.cast::<PyType>()?;
Ok(tpe.clone().unbind())
})?
};
Expand Down
6 changes: 3 additions & 3 deletions python/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,9 @@ impl PyTokenizer {
py: Python<'py>,
text: &'py str,
mode: Option<&Bound<'py, PyAny>>,
logger: Option<PyObject>,
logger: Option<Py<PyAny>>,
out: Option<Bound<'py, PyMorphemeListWrapper>>,
) -> PyResult<Bound<PyMorphemeListWrapper>> {
) -> PyResult<Bound<'py, PyMorphemeListWrapper>> {
// restore default mode on scope exit
let mode = match mode {
None => None,
Expand All @@ -164,7 +164,7 @@ impl PyTokenizer {

// analysis can be done without GIL
errors::wrap_ctx(
py.allow_threads(|| {
py.detach(|| {
tokenizer.reset().push_str(text);
tokenizer.do_tokenize()
}),
Expand Down
2 changes: 1 addition & 1 deletion sudachi-fuzz/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ fn consume_mlist<'a, 'b: 'a>(
}

// mlist.get_internal_cost() as isize;
// use black_box function to forbit optimizing accesses to API functions
// use black_box function to forbid optimizing accesses to API functions
// this is important for fuzzing: we want to trigger any panics that can happen
for i in 0..mlist.len() {
let m = mlist.get(i);
Expand Down
2 changes: 1 addition & 1 deletion sudachi/src/dic/lexicon/word_id_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pub struct WordIdTable<'a> {
}

impl<'a> WordIdTable<'a> {
pub fn new(bytes: &'a [u8], size: u32, offset: usize) -> WordIdTable {
pub fn new(bytes: &'a [u8], size: u32, offset: usize) -> WordIdTable<'a> {
WordIdTable {
bytes,
size,
Expand Down
2 changes: 1 addition & 1 deletion sudachi/src/dic/lexicon/word_infos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ impl<'a> WordInfos<'a> {
offset: usize,
_word_size: u32,
has_synonym_group_ids: bool,
) -> WordInfos {
) -> WordInfos<'a> {
WordInfos {
bytes,
offset,
Expand Down
2 changes: 1 addition & 1 deletion sudachi/src/dic/lexicon/word_params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl<'a> WordParams<'a> {
const PARAM_SIZE: usize = 3;
const ELEMENT_SIZE: usize = 2 * Self::PARAM_SIZE;

pub fn new(bytes: &'a [u8], size: u32, offset: usize) -> WordParams {
pub fn new(bytes: &'a [u8], size: u32, offset: usize) -> WordParams<'a> {
let n_entries = size as usize * Self::PARAM_SIZE;
Self {
data: CowArray::from_bytes(bytes, offset, n_entries),
Expand Down
2 changes: 1 addition & 1 deletion sudachi/src/dic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ impl<'a> LoadedDictionary<'a> {
pub(crate) fn merge_dictionary(
mut self,
other: DictionaryLoader<'a>,
) -> SudachiResult<LoadedDictionary> {
) -> SudachiResult<LoadedDictionary<'a>> {
let npos = self.grammar.pos_list.len();
let lexicon = other.lexicon;
let grammar = other.grammar;
Expand Down
Loading