Skip to content
This repository was archived by the owner on Feb 12, 2026. It is now read-only.

Commit 54ca146

Browse files
committed
Merge branch 'main' into bump
2 parents 68d2432 + ae0b7a8 commit 54ca146

File tree

10 files changed

+570
-2
lines changed

10 files changed

+570
-2
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ serde_json = "1.0.145"
4545
num_cpus = "1.17.0"
4646

4747
lindera = "1.4.2"
48+
lindera = "1.4.2"

src/dictionary.rs

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,29 @@
1+
//! Dictionary management for morphological analysis.
2+
//!
3+
//! This module provides functionality for building, loading, and managing dictionaries
4+
//! used in morphological analysis.
5+
//!
6+
//! # Dictionary Types
7+
//!
8+
//! - **Dictionary**: Main dictionary for morphological analysis
9+
//! - **UserDictionary**: Custom user-defined dictionary for additional words
10+
//!
11+
//! # Examples
12+
//!
13+
//! ```python
14+
//! import lindera
15+
//!
16+
//! # Load a pre-built dictionary
17+
//! dictionary = lindera.load_dictionary("ipadic")
18+
//!
19+
//! # Build a custom dictionary
20+
//! metadata = lindera.Metadata()
21+
//! lindera.build_dictionary("/path/to/input", "/path/to/output", metadata)
22+
//!
23+
//! # Build a user dictionary
24+
//! lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
25+
//! ```
26+
127
use std::path::Path;
228

329
use pyo3::{exceptions::PyValueError, prelude::*};
@@ -10,6 +36,20 @@ use lindera::dictionary::{
1036

1137
use crate::metadata::PyMetadata;
1238

39+
/// A morphological analysis dictionary.
40+
///
41+
/// Contains the data structures needed for tokenization and morphological analysis.
42+
///
43+
/// # Examples
44+
///
45+
/// ```python
46+
/// # Load a dictionary
47+
/// dictionary = lindera.load_dictionary("ipadic")
48+
///
49+
/// # Access metadata
50+
/// print(dictionary.metadata_name())
51+
/// print(dictionary.metadata_encoding())
52+
/// ```
1353
#[pyclass(name = "Dictionary")]
1454
#[derive(Clone)]
1555
pub struct PyDictionary {
@@ -18,14 +58,17 @@ pub struct PyDictionary {
1858

1959
#[pymethods]
2060
impl PyDictionary {
61+
/// Returns the dictionary name as an owned `String`, cloned from `inner.metadata.name`.
2162
pub fn metadata_name(&self) -> String {
2263
self.inner.metadata.name.clone()
2364
}
2465

66+
/// Returns the dictionary's character encoding as an owned `String`, cloned from `inner.metadata.encoding`.
2567
pub fn metadata_encoding(&self) -> String {
2668
self.inner.metadata.encoding.clone()
2769
}
2870

71+
/// Returns the dictionary's full metadata as a `PyMetadata`, converted from a clone of `inner.metadata`.
2972
pub fn metadata(&self) -> PyMetadata {
3073
PyMetadata::from(self.inner.metadata.clone())
3174
}
@@ -46,6 +89,21 @@ impl PyDictionary {
4689
}
4790
}
4891

92+
/// A user-defined dictionary for custom words.
93+
///
94+
/// User dictionaries allow you to add custom words and their morphological features
95+
/// that are not present in the main dictionary.
96+
///
97+
/// # Examples
98+
///
99+
/// ```python
100+
/// # Build a user dictionary
101+
/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
102+
///
103+
/// # Load it
104+
/// metadata = lindera.Metadata()
105+
/// user_dict = lindera.load_user_dictionary("/path/to/output", metadata)
106+
/// ```
49107
#[pyclass(name = "UserDictionary")]
50108
#[derive(Clone)]
51109
pub struct PyUserDictionary {
@@ -72,6 +130,24 @@ impl PyUserDictionary {
72130
}
73131
}
74132

133+
/// Builds a dictionary from source files.
134+
///
135+
/// # Arguments
136+
///
137+
/// * `input_dir` - Directory containing dictionary source files.
138+
/// * `output_dir` - Directory where the built dictionary will be saved.
139+
/// * `metadata` - Metadata configuration for the dictionary.
140+
///
141+
/// # Errors
142+
///
143+
/// Returns an error if the input directory doesn't exist or if the build fails.
144+
///
145+
/// # Examples
146+
///
147+
/// ```python
148+
/// metadata = lindera.Metadata(name="custom", encoding="UTF-8")
149+
/// lindera.build_dictionary("/path/to/input", "/path/to/output", metadata)
150+
/// ```
75151
#[pyfunction]
76152
#[pyo3(signature = (input_dir, output_dir, metadata))]
77153
pub fn build_dictionary(input_dir: &str, output_dir: &str, metadata: PyMetadata) -> PyResult<()> {
@@ -93,6 +169,34 @@ pub fn build_dictionary(input_dir: &str, output_dir: &str, metadata: PyMetadata)
93169
Ok(())
94170
}
95171

172+
/// Builds a user dictionary from a CSV file.
173+
///
174+
/// # Arguments
175+
///
176+
/// * `_kind` - Dictionary kind (currently unused, reserved for future use).
177+
/// * `input_file` - Path to the CSV file containing user dictionary entries.
178+
/// * `output_dir` - Directory where the built user dictionary will be saved.
179+
/// * `metadata` - Optional metadata configuration. If None, default values are used.
180+
///
181+
/// # CSV Format
182+
///
183+
/// The CSV file should contain entries in the format specified by the dictionary schema.
184+
/// Typically: surface,reading,pronunciation
185+
///
186+
/// # Errors
187+
///
188+
/// Returns an error if the input file doesn't exist or if the build fails.
189+
///
190+
/// # Examples
191+
///
192+
/// ```python
193+
/// # Build with default metadata
194+
/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output")
195+
///
196+
/// # Build with custom metadata
197+
/// metadata = lindera.Metadata()
198+
/// lindera.build_user_dictionary("ipadic", "user.csv", "/path/to/output", metadata)
199+
/// ```
96200
#[pyfunction]
97201
#[pyo3(signature = (_kind, input_file, output_dir, metadata=None))]
98202
pub fn build_user_dictionary(
@@ -129,6 +233,34 @@ pub fn build_user_dictionary(
129233
Ok(())
130234
}
131235

236+
/// Loads a dictionary from the specified URI.
237+
///
238+
/// # Arguments
239+
///
240+
/// * `uri` - URI to the dictionary. Can be a file path or embedded dictionary name.
241+
///
242+
/// # Supported URIs
243+
///
244+
/// - File paths: `/path/to/dictionary`
245+
/// - Embedded dictionaries: `ipadic`, `unidic`, `ko-dic`, `cc-cedict`
246+
///
247+
/// # Returns
248+
///
249+
/// A loaded `Dictionary` object.
250+
///
251+
/// # Errors
252+
///
253+
/// Returns an error if the dictionary cannot be loaded from the specified URI.
254+
///
255+
/// # Examples
256+
///
257+
/// ```python
258+
/// # Load an embedded dictionary
259+
/// dictionary = lindera.load_dictionary("ipadic")
260+
///
261+
/// # Load from file path
262+
/// dictionary = lindera.load_dictionary("/path/to/dictionary")
263+
/// ```
132264
#[pyfunction]
133265
#[pyo3(signature = (uri))]
134266
pub fn load_dictionary(uri: &str) -> PyResult<PyDictionary> {
@@ -137,6 +269,27 @@ pub fn load_dictionary(uri: &str) -> PyResult<PyDictionary> {
137269
.map(PyDictionary::new)
138270
}
139271

272+
/// Loads a user dictionary from the specified URI.
273+
///
274+
/// # Arguments
275+
///
276+
/// * `uri` - URI to the user dictionary directory.
277+
/// * `metadata` - Metadata configuration for the user dictionary.
278+
///
279+
/// # Returns
280+
///
281+
/// A loaded `UserDictionary` object.
282+
///
283+
/// # Errors
284+
///
285+
/// Returns an error if the user dictionary cannot be loaded.
286+
///
287+
/// # Examples
288+
///
289+
/// ```python
290+
/// metadata = lindera.Metadata()
291+
/// user_dict = lindera.load_user_dictionary("/path/to/user_dict", metadata)
292+
/// ```
140293
#[pyfunction]
141294
#[pyo3(signature = (uri, metadata))]
142295
pub fn load_user_dictionary(uri: &str, metadata: PyMetadata) -> PyResult<PyUserDictionary> {

src/error.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
1+
//! Error types for Lindera operations.
2+
//!
3+
//! This module provides error types used throughout the Lindera Python bindings.
4+
15
use std::fmt;
26

37
use pyo3::exceptions::PyException;
48
use pyo3::prelude::*;
59

10+
/// Error type for Lindera operations.
11+
///
12+
/// Represents errors that can occur during tokenization, dictionary operations,
13+
/// or other Lindera functionality.
614
#[pyclass(name = "LinderaError")]
715
#[derive(Debug, Clone)]
816
pub struct PyLinderaError {

src/lib.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,34 @@
1+
//! # Lindera Python Bindings
2+
//!
3+
//! Python bindings for [Lindera](https://github.com/lindera/lindera), a morphological analysis library for CJK text.
4+
//!
5+
//! Lindera provides high-performance tokenization and morphological analysis for:
6+
//! - Japanese (IPADIC, IPADIC NEologd, UniDic)
7+
//! - Korean (ko-dic)
8+
//! - Chinese (CC-CEDICT)
9+
//!
10+
//! ## Features
11+
//!
12+
//! - **Dictionary management**: Build, load, and use custom dictionaries
13+
//! - **Tokenization**: Multiple tokenization modes (normal, decompose)
14+
//! - **Filters**: Character and token filtering pipeline
15+
//! - **Training**: Train custom morphological models (with `train` feature)
16+
//! - **User dictionaries**: Support for custom user dictionaries
17+
//!
18+
//! ## Examples
19+
//!
20+
//! ```python
21+
//! import lindera
22+
//!
23+
//! # Create a tokenizer
24+
//! tokenizer = lindera.TokenizerBuilder().build()
25+
//!
26+
//! # Tokenize text
27+
//! tokens = tokenizer.tokenize("関西国際空港")
28+
//! for token in tokens:
29+
//!     print(token["text"], token["detail"])
30+
//! ```
31+
132
pub mod dictionary;
233
pub mod error;
334
pub mod metadata;
@@ -17,11 +48,19 @@ use crate::mode::{PyMode, PyPenalty};
1748
use crate::schema::{PyFieldDefinition, PyFieldType, PySchema};
1849
use crate::tokenizer::{PyTokenizer, PyTokenizerBuilder};
1950

51+
/// Returns the version of the lindera-python package.
52+
///
53+
/// # Returns
54+
///
55+
/// The crate's `CARGO_PKG_VERSION`, embedded at compile time via `env!`, as an owned `String` (normally "major.minor.patch").
2056
#[pyfunction]
2157
pub fn version() -> String {
2258
env!("CARGO_PKG_VERSION").to_string()
2359
}
2460

61+
/// Python module definition for lindera.
62+
///
63+
/// This module exports all classes and functions available to Python code.
2564
#[pymodule]
2665
fn lindera(module: &Bound<'_, PyModule>) -> PyResult<()> {
2766
module.add_class::<PyDictionary>()?;

src/metadata.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,25 @@
1+
//! Dictionary metadata configuration.
2+
//!
3+
//! This module provides structures for configuring dictionary metadata, including
4+
//! compression algorithms, character encodings, and schema definitions.
5+
//!
6+
//! # Examples
7+
//!
8+
//! ```python
9+
//! # Create metadata with default values
10+
//! metadata = lindera.Metadata()
11+
//!
12+
//! # Create metadata with custom values
13+
//! metadata = lindera.Metadata(
14+
//! name="custom_dict",
15+
//! encoding="UTF-8",
16+
//! compress_algorithm=lindera.CompressionAlgorithm.Deflate
17+
//! )
18+
//!
19+
//! # Load metadata from JSON
20+
//! metadata = lindera.Metadata.from_json_file("metadata.json")
21+
//! ```
22+
123
use std::collections::HashMap;
224

325
use pyo3::prelude::*;
@@ -6,12 +28,19 @@ use lindera::dictionary::{CompressionAlgorithm, Metadata};
628

729
use crate::schema::PySchema;
830

31+
/// Compression algorithm for dictionary data.
32+
///
33+
/// Determines how dictionary data is compressed when saved to disk. Exposed to Python as `CompressionAlgorithm`; convertible from `lindera::dictionary::CompressionAlgorithm`.
934
#[pyclass(name = "CompressionAlgorithm")]
1035
#[derive(Debug, Clone)]
1136
pub enum PyCompressionAlgorithm {
37+
/// DEFLATE compression algorithm
1238
Deflate,
39+
/// Zlib compression algorithm
1340
Zlib,
41+
/// Gzip compression algorithm
1442
Gzip,
43+
/// No compression (raw data)
1544
Raw,
1645
}
1746

@@ -53,6 +82,24 @@ impl From<CompressionAlgorithm> for PyCompressionAlgorithm {
5382
}
5483
}
5584

85+
/// Dictionary metadata configuration.
86+
///
87+
/// Contains all configuration parameters for building and using dictionaries.
88+
///
89+
/// # Fields
90+
///
91+
/// * `name` - Dictionary name
92+
/// * `encoding` - Character encoding (default: "UTF-8")
93+
/// * `compress_algorithm` - Compression algorithm (default: Deflate)
94+
/// * `default_word_cost` - Default cost for unknown words (default: -10000)
95+
/// * `default_left_context_id` - Default left context ID (default: 1288)
96+
/// * `default_right_context_id` - Default right context ID (default: 1288)
97+
/// * `default_field_value` - Default value for missing fields (default: "*")
98+
/// * `flexible_csv` - Allow flexible CSV parsing (default: false)
99+
/// * `skip_invalid_cost_or_id` - Skip entries with invalid cost/ID (default: false)
100+
/// * `normalize_details` - Normalize morphological details (default: false)
101+
/// * `dictionary_schema` - Schema for main dictionary
102+
/// * `user_dictionary_schema` - Schema for user dictionary
56103
#[pyclass(name = "Metadata")]
57104
#[derive(Debug, Clone)]
58105
pub struct PyMetadata {

0 commit comments

Comments
 (0)