
Commit

fix: upgrading to the latest versions
angelip2303 committed Dec 23, 2024
1 parent 3b6faf4 commit 7ddb59d
Showing 9 changed files with 122 additions and 152 deletions.
34 changes: 22 additions & 12 deletions Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "pschema-rs"
version = "0.0.3"
authors = [ "Ángel Iglesias Préstamo <[email protected]>" ]
version = "0.0.4"
authors = ["Ángel Iglesias Préstamo <[email protected]>"]
description = "Pregel-based schema validation algorithm written in Rust for generating Wikidata subsets"
documentation = "https://docs.rs/crate/pschema-rs/latest"
repository = "https://github.com/angelip2303/pschema-rs"
@@ -12,14 +12,24 @@ keywords = ["pregel", "wikidata", "subsetting", "duckdb", "validation"]
categories = ["algorithms", "database", "mathematics", "science"]

[dependencies]
-pregel-rs = { version = "0.0.13" }
-wikidata-rs = { version = "0.0.4" }
-polars = { version = "0.30.0", features = ["lazy", "is_in", "performant", "parquet", "chunked_ids", "list_eval", "dtype-categorical", "rows", "is_first"] }
-duckdb = { version = "0.7.1" }
+pregel-rs = { path = "../pregel-rs" }
+wikidata-rs = { path = "../wd2duckdb/wikidata-rs" }
+polars = { version = "0.45.1", features = [
+    "lazy",
+    "is_in",
+    "performant",
+    "parquet",
+    "chunked_ids",
+    "list_eval",
+    "dtype-categorical",
+    "rows",
+    "is_first_distinct",
+] }
+duckdb = { version = "1.1.1" }
rayon = "1.7.0"
wikidata = "0.3.0"
strum = "0.24.1"
strum_macros = "0.24"
wikidata = "1.1.0"
strum = "0.26.3"
strum_macros = "0.26.4"
bimap = "0.6.3"
rio_turtle = "0.8.4"
rio_api = "0.8.4"
@@ -28,12 +38,12 @@ rio_api = "0.8.4"
jemallocator = "0.5.0"

[target.'cfg(target_env = "msvc")'.dependencies]
mimalloc = { version = "0.1.37", default-features = false }
mimalloc = { version = "0.1.43", default-features = false }

[dev-dependencies]
duckdb = { version="0.7.1", features=["bundled"] }
duckdb = { version = "1.1.1", features = ["bundled"] }

[profile.release]
codegen-units = 1
opt-level = 3
lto = "thin"
lto = "thin"
88 changes: 16 additions & 72 deletions README.md
@@ -5,98 +5,42 @@
[![latest_version](https://img.shields.io/crates/v/pschema-rs)](https://crates.io/crates/pschema-rs)
[![documentation](https://img.shields.io/docsrs/pschema-rs/latest)](https://docs.rs/pschema-rs/latest/pschema_rs/)

-`pschema-rs` is a Rust library that provides a Pregel-based schema validation algorithm for generating subsets of data
+`pschema-rs` is a Rust library that provides a Pregel-based schema validation algorithm for generating subsets of data
from Wikidata. It is designed to be efficient, scalable, and easy to use, making it suitable for a wide range of applications
that involve processing large amounts of data from Wikidata.

## Features

-- **Pregel-based schema validation**: `pschema-rs` uses the Pregel model, a graph-based computation model, to perform
-schema validation on Wikidata entities. This allows for efficient and scalable processing of large datasets.
+- **Pregel-based schema validation**: `pschema-rs` uses the Pregel model, a graph-based computation model, to perform
+  schema validation on Wikidata entities. This allows for efficient and scalable processing of large datasets.

- **Rust implementation**: `pschema-rs` is implemented in Rust, a systems programming language known for its performance,
-memory safety, and concurrency features. This ensures that the library is fast, reliable, and safe to use.
+  memory safety, and concurrency features. This ensures that the library is fast, reliable, and safe to use.

-- **Wikidata subset generation**: `pschema-rs` provides functionality to generate subsets of data from Wikidata based on
-schema validation rules. This allows users to filter and extract relevant data from Wikidata based on their specific
-requirements.
+- **Wikidata subset generation**: `pschema-rs` provides functionality to generate subsets of data from Wikidata based on
+  schema validation rules. This allows users to filter and extract relevant data from Wikidata based on their specific
+  requirements.

-- **Customizable validation rules**: `pschema-rs` allows users to define their own validation rules using a simple and
-flexible syntax. This makes it easy to customize the schema validation process according to the specific needs of a given
-application.
+- **Customizable validation rules**: `pschema-rs` allows users to define their own validation rules using a simple and
+  flexible syntax. This makes it easy to customize the schema validation process according to the specific needs of a given
+  application.

- **Easy-to-use API**: `pschema-rs` provides a user-friendly API that makes it easy to integrate the library into any Rust
-project. The API provides a high-level interface for performing schema validation and generating Wikidata subsets, with
-comprehensive documentation and examples to help users get started quickly.
+  project. The API provides a high-level interface for performing schema validation and generating Wikidata subsets, with
+  comprehensive documentation and examples to help users get started quickly.

## Installation

To use `pschema-rs` in your Rust project, you can add it as a dependency in your `Cargo.toml` file:

```toml
[dependencies]
pschema = "0.0.2"
pschema = "0.0.4"
```
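One caveat with this snippet: the manifest above declares the package as `pschema-rs`, and the crates.io badge at the top points at `pschema-rs` as well, so the dependency key presumably has to use that name. A sketch of the line that would actually resolve:

```toml
[dependencies]
pschema-rs = "0.0.4"
```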

## Usage

-Here's an example of how you can use `pschema-rs` to perform schema validation and generate a subset of data from Wikidata.
-Note that what we are doing here is first, defining the `ShapeExpression` we want the algorithm to validate. Next, we import
-the Wikidata entities from a file. Note that the import methods we have defined create an edge DataFrame, and as such, we
-need to call to the function `GraphFrame::from_edges(edges)`, which will build the GraphFrame from the imported edges. Lastly,
-by calling `PSchema::new(start).validate(graph)`, we will both construct the `PSchema` algorithm provided the `ShapeExpression`
-we have defined, first, and create the subset of the graph, second. Then, we print the results. Note that we can also export
-the results to a file. See the [examples](https://github.com/angelip2303/pschema-rs/tree/main/examples) for more information.
-
-```rust
-use pregel_rs::graph_frame::GraphFrame;
-use pschema_rs::backends::duckdb::DuckDB;
-use pschema_rs::backends::Backend;
-use pschema_rs::pschema::PSchema;
-use pschema_rs::shape::shex::Shape;
-use pschema_rs::shape::shex::NodeConstraint;
-use pschema_rs::shape::shex::TripleConstraint;
-use wikidata_rs::id::Id;
-
-fn main() -> Result<(), String> {
-    // Define validation rules
-    let start = Shape::TripleConstraint(TripleConstraint::new(
-        "City",
-        u32::from(Id::from("P31")),
-        NodeConstraint::Value(u32::from(Id::from("Q515"))),
-    ));
-
-    // Load Wikidata entities
-    let edges = DuckDB::import("./examples/from_duckdb/3000lines.duckdb")?;
-
-    // Perform schema validation
-    match GraphFrame::from_edges(edges) {
-        Ok(graph) => match PSchema::new(start).validate(graph) {
-            Ok(result) => {
-                println!("Schema validation result:");
-                println!("{:?}", result);
-                Ok(())
-            }
-            Err(error) => Err(error.to_string()),
-        },
-        Err(error) => Err(format!("Cannot create a GraphFrame: {}", error)),
-    }
-}
-
-```
-
-You could also run one of the examples to check how this library works:
-
-```sh
-cargo build
-cargo run --example from_duckdb
-```
-
-Or follow the guidelines explained in [examples/from_uniprot](https://github.com/angelip2303/pschema-rs/tree/main/examples/from_uniprot)
-where a more detailed use-case is shown.
-
-For more information on how to define validation rules, load entities from Wikidata, and process subsets of data, refer
-to the documentation.
+TBD
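With the usage walkthrough reduced to a placeholder, the example removed above stays the clearest sketch of the flow. A condensed version, assuming the 0.0.4 API keeps the module paths and signatures shown in the old README:

```rust
use pregel_rs::graph_frame::GraphFrame;
use pschema_rs::backends::duckdb::DuckDB;
use pschema_rs::backends::Backend;
use pschema_rs::pschema::PSchema;
use pschema_rs::shape::shex::{NodeConstraint, Shape, TripleConstraint};
use wikidata_rs::id::Id;

fn main() -> Result<(), String> {
    // Shape: keep entities whose P31 ("instance of") value is Q515 ("city").
    let start = Shape::TripleConstraint(TripleConstraint::new(
        "City",
        u32::from(Id::from("P31")),
        NodeConstraint::Value(u32::from(Id::from("Q515"))),
    ));

    // Import produces an edge DataFrame; GraphFrame::from_edges builds the graph.
    let edges = DuckDB::import("./examples/from_duckdb/3000lines.duckdb")?;
    let graph = GraphFrame::from_edges(edges).map_err(|e| e.to_string())?;

    // Validate the shape against the graph and print the resulting subset.
    let subset = PSchema::new(start)
        .validate(graph)
        .map_err(|e| e.to_string())?;
    println!("{:?}", subset);
    Ok(())
}
```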

## Related projects

@@ -114,11 +58,11 @@ the Free Software Foundation, either version 3 of the License, or

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
-along with this program.  If not, see <https://www.gnu.org/licenses/>.
+along with this program. If not, see <https://www.gnu.org/licenses/>.

**By contributing to this project, you agree to release your
contributions under the same license.**
20 changes: 10 additions & 10 deletions src/backends/duckdb.rs
@@ -35,7 +35,7 @@ impl Backend for DuckDB {
let format = |id: DataType| {
format!(
"SELECT src_id, property_id, CAST({:} AS UINTEGER) FROM {:}",
-u32::from(Id::DataType(id.to_owned())),
+u32::from(Id::DataType(id.clone())),
id.as_ref()
)
};
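The `id.to_owned()` → `id.clone()` swap changes nothing at runtime: called on a value (not a reference) of a `Clone` type, `to_owned()` resolves to the blanket `impl<T: Clone> ToOwned for T` and just forwards to `clone()`; writing `clone()` states the intent directly. A self-contained illustration with a stand-in enum (not this crate's `DataType`):

```rust
// Stand-in type for illustration only.
#[derive(Clone, Debug, PartialEq)]
enum DataType {
    Entity,
    Property,
}

fn main() {
    let id = DataType::Entity;
    // Both calls produce the same owned copy; clone() is simply more direct.
    assert_eq!(id.to_owned(), id.clone());
    println!("{:?}", id);
}
```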
@@ -77,35 +77,35 @@
.map(|batch| {
match DataFrame::new(vec![
Series::new(
-Column::Subject.as_ref(),
+// because we know that the first column is the src_id
+Column::Subject.as_ptr(),
batch
.column(0)
.as_any()
.downcast_ref::<UInt32Array>()
.unwrap()
.values(),
-),
+)
+.into(),
Series::new(
-Column::Predicate.as_ref(),
+// because we know that the second column is the property_id
+Column::Predicate.as_ptr(),
batch
.column(1)
.as_any()
.downcast_ref::<UInt32Array>()
.unwrap()
.values(),
-),
+)
+.into(),
Series::new(
-Column::Object.as_ref(),
+// because we know that the third column is the dst_id
+Column::Object.as_ptr(),
batch
.column(2)
.as_any()
.downcast_ref::<UInt32Array>()
.unwrap()
.values(),
-),
+)
+.into(),
]) {
Ok(tmp_dataframe) => tmp_dataframe,
Err(_) => DataFrame::empty(),
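The `.into()` calls added after each `Series::new(...)` reflect a Polars 0.4x change: `DataFrame::new` now takes `Vec<Column>` rather than `Vec<Series>`, with `From<Series> for Column` doing the wrapping, and `Series::new` takes a `PlSmallStr` name instead of `&str`. A minimal sketch of that pattern, assuming polars 0.45 (column names and values here are illustrative, independent of this repository's `Column` enum):

```rust
use polars::prelude::*;

fn edges_frame() -> PolarsResult<DataFrame> {
    // Series::new takes a PlSmallStr name now; a &str converts via .into().
    let src = Series::new("src_id".into(), &[1u32, 2, 3]);
    let dst = Series::new("dst_id".into(), &[4u32, 5, 6]);
    // DataFrame::new expects Vec<Column>, so each Series is wrapped with .into().
    DataFrame::new(vec![src.into(), dst.into()])
}

fn main() -> PolarsResult<()> {
    println!("{}", edges_frame()?);
    Ok(())
}
```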
23 changes: 11 additions & 12 deletions src/backends/ntriples.rs
@@ -18,7 +18,7 @@ pub struct NTriples;

impl Backend for NTriples {
fn import(path: &str) -> Result<DataFrame, String> {
-enable_string_cache(true);
+enable_string_cache();

let mut subjects = Vec::<String>::new();
let mut predicates = Vec::<String>::new();
@@ -41,15 +41,14 @@

while !parser.is_end() {
if parser.parse_step(&mut on_triple).is_err() {
-// We skip the line if it is not a valid triple
continue;
}
}

match df![
-Column::Subject.as_ref() => Series::new(Column::Subject.as_ref(), subjects).cast(&DataType::Categorical(None)).unwrap(),
-Column::Predicate.as_ref() => Series::new(Column::Predicate.as_ref(), predicates).cast(&DataType::Categorical(None)).unwrap(),
-Column::Object.as_ref() => Series::new(Column::Object.as_ref(), objects).cast(&DataType::Categorical(None)).unwrap(),
+Column::Subject.as_ref() => Series::new(Column::Subject.as_ptr(), subjects).cast(&DataType::Categorical(None, CategoricalOrdering::Lexical)).unwrap(),
+Column::Predicate.as_ref() => Series::new(Column::Predicate.as_ptr(), predicates).cast(&DataType::Categorical(None, CategoricalOrdering::Lexical)).unwrap(),
+Column::Object.as_ref() => Series::new(Column::Object.as_ptr(), objects).cast(&DataType::Categorical(None, CategoricalOrdering::Lexical)).unwrap(),
] {
Ok(edges) => Ok(edges),
Err(_) => Err(String::from("Error creating the edges DataFrame")),
@@ -62,12 +61,12 @@
let mut formatter = NTriplesFormatter::new(writer);

let df = df
-.to_owned()
+.clone()
.lazy()
.select([
-col(Column::Subject.as_ref()).cast(DataType::Utf8),
-col(Column::Predicate.as_ref()).cast(DataType::Utf8),
-col(Column::Object.as_ref()).cast(DataType::Utf8),
+col(Column::Subject.as_ref()).cast(DataType::String),
+col(Column::Predicate.as_ref()).cast(DataType::String),
+col(Column::Object.as_ref()).cast(DataType::String),
])
.collect()
.unwrap();
@@ -82,7 +81,7 @@
.format(&Triple {
subject: match row.get(0) {
Some(subject) => match subject {
-AnyValue::Utf8(iri) => NamedNode {
+AnyValue::String(iri) => NamedNode {
iri: &iri[1..iri.len() - 1],
}
.into(),
@@ -96,7 +95,7 @@ },
},
predicate: match row.get(1) {
Some(predicate) => match predicate {
-AnyValue::Utf8(iri) => NamedNode {
+AnyValue::String(iri) => NamedNode {
iri: &iri[1..iri.len() - 1],
},
_ => {
@@ -109,7 +108,7 @@ },
},
object: match row.get(2) {
Some(object) => match object {
-AnyValue::Utf8(iri) => {
+AnyValue::String(iri) => {
if iri.contains("^^") {
let v: Vec<_> = iri.split("^^").collect();
Literal::Typed {
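Three of the migrations in this file come straight from Polars API churn: `enable_string_cache` dropped its boolean flag, `DataType::Categorical` gained a `CategoricalOrdering` parameter, and `Utf8` was renamed to `String` in both `DataType` and `AnyValue`. A minimal sketch of those changes together, assuming polars 0.45 with the `dtype-categorical` feature (values are illustrative):

```rust
use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // The global string cache is toggled with enable_string_cache() /
    // disable_string_cache(); the old boolean argument is gone.
    enable_string_cache();

    let subjects = Series::new("subject".into(), &["Q515", "Q515", "Q31"])
        // Categorical now carries an ordering; Lexical compares by string value.
        .cast(&DataType::Categorical(None, CategoricalOrdering::Lexical))?;
    assert!(matches!(subjects.dtype(), DataType::Categorical(_, _)));

    // DataType::Utf8 is now DataType::String (and AnyValue::Utf8 became
    // AnyValue::String), so converting back is a String cast.
    let roundtrip = subjects.cast(&DataType::String)?;
    println!("{:?}", roundtrip);
    Ok(())
}
```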

