From 24b36da9f611314c083fafa72acb166ac856f83a Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 3 Nov 2016 18:02:57 -0700 Subject: [PATCH 001/122] Lengthy battle with borrow checker We won --- src/query.rs | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/src/query.rs b/src/query.rs index 7a8d9da..b54b07d 100644 --- a/src/query.rs +++ b/src/query.rs @@ -69,8 +69,8 @@ impl DocResult { //struct QueryRuntimeFilter {} trait QueryRuntimeFilter { - fn first_result<'a>(&'a mut self, start_id: u64) -> Result, Error<'a>>; - fn next_result<'a>(&'a mut self) -> Result, Error<'a>>; + fn first_result(&mut self, start_id: u64) -> Result, Error>; + fn next_result(&mut self) -> Result, Error>; } pub struct Query {} @@ -181,13 +181,13 @@ impl<'a> QueryRuntimeFilter for ExactMatchFilter<'a> { } -struct AndFilter { - filters: Vec>, +struct AndFilter<'a> { + filters: Vec>, array_depth: u64, } -impl AndFilter { - fn new(filters: Vec>, array_depth: u64) -> AndFilter { +impl<'a> AndFilter<'a> { + fn new(filters: Vec>, array_depth: u64) -> AndFilter<'a> { AndFilter { filters: filters, array_depth: array_depth, @@ -195,7 +195,7 @@ impl AndFilter { } } -impl QueryRuntimeFilter for AndFilter { +impl<'a> QueryRuntimeFilter for AndFilter<'a> { fn first_result(&mut self, start_id: u64) -> Result, Error> { Ok(None) } @@ -206,20 +206,20 @@ impl QueryRuntimeFilter for AndFilter { -struct Parser<'a, 'b> { +struct Parser<'a> { query: &'a str, offset: usize, kb: KeyBuilder, - snapshot: &'b Snapshot<'b>, + snapshot: &'a Snapshot<'a>, } -impl<'a, 'b> Parser<'a, 'b> { - fn new(query: &'a str, snapshot: &'b Snapshot) -> Parser<'a, 'b> { +impl<'a> Parser<'a> { + fn new(query: &'a str, snapshot: &'a Snapshot<'a>) -> Parser<'a> { Parser{ query: query, offset: 0, kb: KeyBuilder::new(), - snapshot: snapshot, + snapshot: &snapshot, } } @@ -287,7 +287,7 @@ impl<'a, 'b> Parser<'a, 'b> { } } - fn compare(&mut self) -> Result, String> { + fn compare<'b>(&'b mut self) -> Result, String> { match self.consume_field() { Some(field) => { if self.consume(".") { @@ -301,7 +301,7 @@ impl<'a, 'b> Parser<'a, 'b> { self.kb.push_object_key(field); let stems = Stems::new(&literal); - let mut filters: Vec> = Vec::new(); + let mut filters: Vec> = Vec::new(); for stem in stems { let iter = self.snapshot.iter(); let filter = Box::new(ExactMatchFilter::new( @@ -313,8 +313,8 @@ impl<'a, 'b> Parser<'a, 'b> { match filters.len() { 0 => panic!("Cannot create a ExactMatchFilter"), - 1 => Ok(filters[0]), - _ => Ok(Box::new(AndFilter::new( + 1 => Ok(filters.pop().unwrap()), + _ => Ok(Box::new(AndFilter::<'a>::new( filters, self.kb.array_depth as u64))), //_ => Ok(filters[0]), //_ => Err("just get it compiled".to_string()), @@ -336,10 +336,6 @@ impl<'a, 'b> Parser<'a, 'b> { } } - fn boolean() -> Option { - - None - } fn build_filter(&mut self) -> Result { @@ -350,7 +346,7 @@ impl<'a, 'b> Parser<'a, 'b> { impl Query { //pub fn get_matches<'a>(query: &str, index: &'a Index) -> Result { - pub fn get_matches<'a, 'b>(query: &str, snapshot: &'b Snapshot) -> Result { + pub fn get_matches<'a>(query: &str, snapshot: &'a Snapshot) -> Result { // match &index.rocks { // &Some(ref rocks) => { // let snapshot = Snapshot::new(rocks); From be586e93568c0d975f47b4dd98535f9fc7582eee Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Sun, 6 Nov 2016 17:52:49 -0800 Subject: [PATCH 002/122] Push current state --- src/query.rs | 129 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 112 insertions(+), 17 
deletions(-) diff --git a/src/query.rs b/src/query.rs index b54b07d..cf24290 100644 --- a/src/query.rs +++ b/src/query.rs @@ -14,15 +14,15 @@ use rocksdb::rocksdb::Snapshot; use records_capnp::payload; #[derive(Debug)] -enum Error<'a> { - Parse(&'a str), +enum Error { + Parse(String), Capnp(capnp::Error), } -impl<'a> error::Error for Error<'a> { +impl error::Error for Error { fn description(&self) -> &str { match *self { - Error::Parse(description) => description, + Error::Parse(ref description) => description, Error::Capnp(ref err) => err.description(), } } @@ -35,13 +35,13 @@ impl<'a> error::Error for Error<'a> { } } -impl<'a> From for Error<'a> { - fn from(err: capnp::Error) -> Error<'a> { +impl From for Error { + fn from(err: capnp::Error) -> Error { Error::Capnp(err) } } -impl<'a> fmt::Display for Error<'a> { +impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Error::Parse(ref err) => write!(f, "Parse error: {}", err), @@ -63,6 +63,31 @@ impl DocResult { array_paths: Vec::new(), } } + + fn truncate_array_paths(&mut self, array_depth: usize) { + for array_path in self.array_paths.iter_mut() { + debug_assert!(array_path.len() >= array_depth); + array_path.resize(array_depth, 0); + } + } + + fn intersect_array_paths(aa: &DocResult, bb: &DocResult) -> Option { + let mut doc_result = DocResult::new(); + debug_assert_eq!(aa.seq, bb.seq); + doc_result.seq = aa.seq; + for array_path_a in &aa.array_paths { + for array_path_b in &bb.array_paths { + if array_path_a == array_path_b { + doc_result.array_paths.push(array_path_a.clone()); + } + } + } + if doc_result.array_paths.is_empty() { + None + } else { + Some(doc_result) + } + } } //trait QueryRuntimeFilter { @@ -181,25 +206,89 @@ impl<'a> QueryRuntimeFilter for ExactMatchFilter<'a> { } + +struct DummyFilter {} + +impl QueryRuntimeFilter for DummyFilter { + fn first_result(&mut self, start_id: u64) -> Result, Error> { + Ok(None) + } + fn next_result(&mut self) -> Result, Error> { + Ok(None) + } +} + + struct AndFilter<'a> { filters: Vec>, - array_depth: u64, + current_filter: usize, + array_depth: usize, } impl<'a> AndFilter<'a> { - fn new(filters: Vec>, array_depth: u64) -> AndFilter<'a> { + fn new(filters: Vec>, array_depth: usize) -> AndFilter<'a> { AndFilter { filters: filters, + current_filter: 0, array_depth: array_depth, } } + + fn result(&mut self, base: Option) -> Result, Error> { + let mut matches_count = self.filters.len(); + // TODO vmx 2016-11-04: Make it nicer + let mut base_result = match base { + Some(base_result) => base_result, + None => return Ok(None), + }; + loop { + base_result.truncate_array_paths(self.array_depth); + + self.current_filter += 1; + if self.current_filter > self.filters.len() { + self.current_filter = 0; + } + + let next = try!(self.filters[self.current_filter].first_result(base_result.seq)); + let mut next_result = match next { + Some(next_result) => next_result, + None => return Ok(None), + }; + next_result.truncate_array_paths(self.array_depth); + + if base_result.seq == next_result.seq { + match DocResult::intersect_array_paths(&base_result, &next_result) { + Some(new_result) => { + base_result = new_result; + matches_count -= 1; + }, + None => { + let new_result = try!(self.filters[self.current_filter].first_result(base_result.seq)); + base_result = match new_result { + Some(base_result) => base_result, + None => return Ok(None), + }; + matches_count = self.filters.len() + } + } + } else { + base_result = next_result; + matches_count = 
self.filters.len(); + } + } + } } impl<'a> QueryRuntimeFilter for AndFilter<'a> { fn first_result(&mut self, start_id: u64) -> Result, Error> { - Ok(None) + let base_result = try!(self.filters[self.current_filter].first_result(start_id)); + self.result(base_result) + //Ok(None) } + fn next_result(&mut self) -> Result, Error> { + //let base_result = try!(self.filters[self.current_filter].next_result()); + //self.result(base_result) Ok(None) } } @@ -287,6 +376,10 @@ impl<'a> Parser<'a> { } } + fn array() { + //XXX GO ON HERE 2016-11-04 + } + fn compare<'b>(&'b mut self) -> Result, String> { match self.consume_field() { Some(field) => { @@ -315,19 +408,23 @@ impl<'a> Parser<'a> { 0 => panic!("Cannot create a ExactMatchFilter"), 1 => Ok(filters.pop().unwrap()), _ => Ok(Box::new(AndFilter::<'a>::new( - filters, self.kb.array_depth as u64))), - //_ => Ok(filters[0]), - //_ => Err("just get it compiled".to_string()), + filters, self.kb.array_depth))), } }, // Empty literal - Ok(None) => {Err("not implemetned yet".to_string())}, + Ok(None) => {Err("Expected string".to_string())}, Err(error) => { Err(error) } } + } else if self.could_consume("[") { + self.kb.push_object_key(field); + //let ret = self.array(); + self.kb.pop_object_key(); + //ret + Err("Not yet implemented".to_string()) } else { - Err("not yet implemented".to_string()) + Err("Expected comparison or array operator".to_string()) } }, None => { @@ -335,8 +432,6 @@ impl<'a> Parser<'a> { } } } - - fn build_filter(&mut self) -> Result { self.whitespace(); From d6ee2f7a1b7de6aea68cd6adfdaaa16bb7817688 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Mon, 7 Nov 2016 09:58:51 -0800 Subject: [PATCH 003/122] Adding array and bool filter --- src/query.rs | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/src/query.rs b/src/query.rs index cf24290..c70e4be 100644 --- a/src/query.rs +++ b/src/query.rs @@ -355,7 +355,7 @@ impl<'a> Parser<'a> { } } - fn consume_string_literal(&mut self) -> Result, String> { + fn consume_string_literal(&mut self) -> Result, Error> { // Does not unescape yet let mut lit = String::new(); if self.consume("\"") { @@ -369,18 +369,46 @@ impl<'a> Parser<'a> { self.whitespace(); Ok(Some(lit)) } else { - Err("Expected \"".to_string()) + Err(Error::Parse("Expected \"".to_string())) } } else { Ok(None) } } - fn array() { - //XXX GO ON HERE 2016-11-04 + fn bool(&mut self) -> Result, Error> { + let left = try!(self.compare()); + let mut filters = vec![left]; + loop { + if !self.consume("&") { + break; + } + + let right = try!(self.compare()); + filters.push(right); + } + if filters.len() == 1 { + Ok(filters.pop().unwrap()) + } else { + Ok(Box::new(AndFilter::new(filters, self.kb.array_depth))) + } } - fn compare<'b>(&'b mut self) -> Result, String> { + + fn array(&mut self) -> Result, Error> { + if !self.consume("[") { + return Err(Error::Parse("Expected '['".to_string())); + } + self.kb.push_array(); + let filter = try!(self.bool()); + self.kb.pop_array(); + if self.consume("]") { + return Err(Error::Parse("Expected ']'".to_string())); + } + Ok(filter) + } + + fn compare<'b>(&'b mut self) -> Result, Error> { match self.consume_field() { Some(field) => { if self.consume(".") { @@ -412,23 +440,22 @@ impl<'a> Parser<'a> { } }, // Empty literal - Ok(None) => {Err("Expected string".to_string())}, + Ok(None) => {Err(Error::Parse("Expected string".to_string()))}, Err(error) => { Err(error) } } } else if self.could_consume("[") { self.kb.push_object_key(field); - //let 
ret = self.array(); + let ret = self.array(); self.kb.pop_object_key(); - //ret - Err("Not yet implemented".to_string()) + ret } else { - Err("Expected comparison or array operator".to_string()) + Err(Error::Parse("Expected comparison or array operator".to_string())) } }, None => { - Err("Expected comparison or array operator".to_string()) + Err(Error::Parse("Expected comparison or array operator".to_string())) } } } From 9a2c3e37e5094105cf9588d7c7f65924cb6357c7 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Mon, 7 Nov 2016 10:13:33 -0800 Subject: [PATCH 004/122] Adding factor filter --- src/query.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/query.rs b/src/query.rs index c70e4be..b0fb772 100644 --- a/src/query.rs +++ b/src/query.rs @@ -408,6 +408,21 @@ impl<'a> Parser<'a> { Ok(filter) } + fn factor(&mut self) -> Result, Error> { + if self.consume("(") { + let filter = try!(self.bool()); + if !self.consume(")") { + Err(Error::Parse("Expected ')'".to_string())) + } else { + Ok(filter) + } + } else if self.could_consume("[") { + self.array() + } else { + Err(Error::Parse("Missing Expression".to_string())) + } + } + fn compare<'b>(&'b mut self) -> Result, Error> { match self.consume_field() { Some(field) => { @@ -455,7 +470,7 @@ impl<'a> Parser<'a> { } }, None => { - Err(Error::Parse("Expected comparison or array operator".to_string())) + self.factor() } } } From 3cdf4dc1974452955d3238ee6905cd1ae22b696e Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Mon, 7 Nov 2016 10:18:30 -0800 Subject: [PATCH 005/122] Implement build_filter() --- src/query.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query.rs b/src/query.rs index b0fb772..c86a5d1 100644 --- a/src/query.rs +++ b/src/query.rs @@ -475,9 +475,9 @@ impl<'a> Parser<'a> { } } - fn build_filter(&mut self) -> Result { + fn build_filter(&mut self) -> Result, Error> { self.whitespace(); - Ok(QueryResults{}) + self.bool() } } From 87ebf58341de21e45d89ebbf5b23bfaa065fc4a4 Mon Sep 17 00:00:00 2001 From: Michael Nitschinger Date: Mon, 7 Nov 2016 16:18:01 -0800 Subject: [PATCH 006/122] Add capnp as required dependencies (#1) capnp should be specified to be installed, otherwise building fails with: ``` thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: Error { kind: Failed, description: "Error while trying to execute `capnp compile`: Failed: No such file or directory (os error 2). Please verify that version 0.5.2 or higher of the capnp executable is installed on your system. See https://capnproto.org/install.html" }', ../src/libcore/result.rs:788 note: Run with `RUST_BACKTRACE=1` for a backtrace. 
``` --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 13f557f..1caf545 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Installation ### Dependencies * [RocksDB](http://rocksdb.org/) + * [capnp-tool](https://capnproto.org/capnp-tool.html) ### Build From 6800c934549296375bcc89c4f8b99e7d7ede3bc1 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Mon, 7 Nov 2016 20:34:38 -0800 Subject: [PATCH 007/122] Compiles with Spacejam's rust-rocksdb --- Cargo.toml | 6 ++- src/index.rs | 30 +++++------ src/query.rs | 146 ++++++++++++++++++++++++++++++++++----------------- 3 files changed, 118 insertions(+), 64 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6ced680..61c5452 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,8 +17,10 @@ stemmer = "0.3.2" unicode-normalization = "0.1.2" [dependencies.rocksdb] -git = "https://github.com/vmx/rust-rocksdb.git" -branch = "vmx" +#git = "https://github.com/vmx/rust-rocksdb.git" +#branch = "vmx" +path = "../spacejam-rocksdb" +#git = "https://github.com/spacejam/rust-rocksdb.git" [dependencies.unicode-segmentation] git = "https://github.com/vmx/unicode-segmentation.git" diff --git a/src/index.rs b/src/index.rs index 1e644ef..4115a03 100644 --- a/src/index.rs +++ b/src/index.rs @@ -7,7 +7,8 @@ use records_capnp::header; use self::rocksdb::Writable; use json_shred::{Shredder}; - +// TODO vmx 2016-11-07: Move errors into their own module +use query::Error; const NOISE_HEADER_VERSION: u64 = 1; @@ -38,7 +39,6 @@ impl Header { pub struct Index { - read_options: rocksdb::ReadOptions, write_options: rocksdb::WriteOptions, high_doc_seq: u64, pub rocks: Option, @@ -53,27 +53,26 @@ pub enum OpenOptions { impl Index { pub fn new() -> Index { Index { - read_options: rocksdb::ReadOptions::new(), write_options: rocksdb::WriteOptions::new(), high_doc_seq: 0, rocks: None, id_str_to_id_seq: HashMap::new(), - batch: rocksdb::WriteBatch::new(), + batch: rocksdb::WriteBatch::default(), } } // NOTE vmx 2016-10-13: Perhpas the name should be specified on `new()` as it is bound // to a single instance. The next question would then be if `open()` really makes sense // or if it should be combined with `new()`. 
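A sketch of what the combined constructor contemplated in this note could look like, with new() and open() folded together so an Index can never exist in an unopened state. The struct shape and names below are stand-ins (assumptions), not code from any patch in this series:

```
use std::collections::HashMap;

// Stand-ins for the real RocksDB handle and open options (assumed names).
pub struct Db;
pub enum OpenOptions {
    Create,
}

pub struct Index {
    high_doc_seq: u64,
    // No Option<...> around the handle: an Index that exists is always open.
    rocks: Db,
    id_str_to_id_seq: HashMap<String, u64>,
}

impl Index {
    // new() and open() merged into a single constructor, as the note asks.
    pub fn open(_name: &str, _options: Option<OpenOptions>) -> Result<Index, String> {
        Ok(Index {
            high_doc_seq: 0,
            rocks: Db,
            id_str_to_id_seq: HashMap::new(),
        })
    }
}

fn main() {
    let _index = Index::open("firstnoisedb", Some(OpenOptions::Create)).unwrap();
}
```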
//fn open(&mut self, name: &str, open_options: Option) -> Result { - pub fn open(&mut self, name: &str, open_options: Option) -> Result<(), String> { - let mut rocks_options = rocksdb::Options::new(); + pub fn open(&mut self, name: &str, open_options: Option) -> Result<(), Error> { + let mut rocks_options = rocksdb::Options::default(); println!("still here1"); let rocks = match rocksdb::DB::open(&rocks_options, name) { Ok(rocks) => rocks, Err(error) => { match open_options { Some(OpenOptions::Create) => (), - _ => return Err(error), + _ => return Err(Error::Rocks(error)), } rocks_options.create_if_missing(true); @@ -88,7 +87,7 @@ impl Index { }; // validate header is there - let value = try!(rocks.get_opt(b"HDB", &self.read_options)).unwrap(); + let value = try!(rocks.get(b"HDB")).unwrap(); // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed // and why we pass on mutable reference of it to `read_message()` let mut ref_value = &*value; @@ -103,8 +102,9 @@ impl Index { // NOTE vmx 2016-10-13: As one index is tied to one database, this should be a method // without a parameter - pub fn delete(name: &str) -> Result<(), String> { - rocksdb::DB::destroy(&rocksdb::Options::new(), name) + pub fn delete(name: &str) -> Result<(), Error> { + let ret = try!(rocksdb::DB::destroy(&rocksdb::Options::default(), name)); + Ok(ret) } pub fn add(&mut self, json: &str) -> Result<(), String> { @@ -121,7 +121,7 @@ impl Index { } // Store the current batch - pub fn flush(&mut self) -> Result<(), String> { + pub fn flush(&mut self) -> Result<(), Error> { // Flush can only be called if the index is open // NOTE vmx 2016-10-17: Perhaps that shouldn't panic? assert!(&self.rocks.is_some()); @@ -130,7 +130,7 @@ impl Index { // Look up all doc ids and 'delete' from the seq_to_ids keyspace for key in self.id_str_to_id_seq.keys() { // TODO vmx 2016-10-17: USe multiget once the Rusts wrapper supports it - match rocks.get_opt(key.as_bytes(), &self.read_options) { + match rocks.get(key.as_bytes()) { Ok(Some(seq)) => { try!(self.batch.delete(&*seq)); }, @@ -148,10 +148,10 @@ impl Index { header.high_seq = self.high_doc_seq; try!(self.batch.put(b"HDB", &*header.serialize())); - let status = rocks.write_opt(&self.batch, &self.write_options); + let status = try!(rocks.write(&self.batch)); self.batch.clear(); self.id_str_to_id_seq.clear(); - status + Ok(status) } pub fn fetch_id(&self, seq: u64) -> Result, String> { @@ -161,7 +161,7 @@ impl Index { let rocks = self.rocks.as_ref().unwrap(); let key = format!("S{}", seq); - match try!(rocks.get_opt(&key.as_bytes(), &self.read_options)) { + match try!(rocks.get(&key.as_bytes())) { // If there is an id, it's UTF-8 Some(id) => Ok(Some(id.to_utf8().unwrap().to_string())), None => Ok(None) diff --git a/src/query.rs b/src/query.rs index c86a5d1..989bacf 100644 --- a/src/query.rs +++ b/src/query.rs @@ -9,14 +9,15 @@ use key_builder::KeyBuilder; use stems::{StemmedWord, Stems}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{DBIterator, SeekKey}; -use rocksdb::rocksdb::Snapshot; +use rocksdb::{self, DBIterator, IteratorMode, Snapshot}; use records_capnp::payload; + #[derive(Debug)] -enum Error { +pub enum Error { Parse(String), Capnp(capnp::Error), + Rocks(rocksdb::Error), } impl error::Error for Error { @@ -24,6 +25,10 @@ impl error::Error for Error { match *self { Error::Parse(ref description) => description, Error::Capnp(ref err) => err.description(), + // XXX vmx 2016-11-07: It should be 
fixed on the RocksDB wrapper + // that it has the std::error:Error implemented and hence + // and err.description() + Error::Rocks(_) => "This is an rocksdb error", } } @@ -31,6 +36,9 @@ impl error::Error for Error { match *self { Error::Parse(_) => None, Error::Capnp(ref err) => Some(err as &error::Error), + // NOTE vmx 2016-11-07: Looks like the RocksDB Wrapper needs to be + // patched to be based on the std::error::Error trait + Error::Rocks(ref err) => None, } } } @@ -41,17 +49,24 @@ impl From for Error { } } +impl From for Error { + fn from(err: rocksdb::Error) -> Error { + Error::Rocks(err) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Error::Parse(ref err) => write!(f, "Parse error: {}", err), Error::Capnp(ref err) => write!(f, "Capnproto error: {}", err), + Error::Rocks(ref err) => write!(f, "RocksDB error: {}", err), } } } -struct DocResult { +pub struct DocResult { seq: u64, array_paths: Vec>, } @@ -93,14 +108,56 @@ impl DocResult { //trait QueryRuntimeFilter { //struct QueryRuntimeFilter {} -trait QueryRuntimeFilter { +pub trait QueryRuntimeFilter { fn first_result(&mut self, start_id: u64) -> Result, Error>; fn next_result(&mut self) -> Result, Error>; } pub struct Query {} -pub struct QueryResults { +pub struct QueryResults<'a> { + filter: Box, + first_has_been_called: bool, + snapshot: Snapshot<'a>, +} + +impl<'a> QueryResults<'a> { + fn new(filter: Box, snapshot: Snapshot<'a>) -> QueryResults<'a> { + QueryResults{ + filter: filter, + first_has_been_called: false, + snapshot: snapshot, + } + } + + fn get_next(&mut self) -> Result, Error> { + let doc_result; + if self.first_has_been_called { + doc_result = try!(self.filter.next_result()); + } else { + self.first_has_been_called = true; + doc_result = try!(self.filter.first_result(0)); + } + match doc_result { + Some(doc_result) => Ok(Some(doc_result.seq)), + None => Ok(None), + } + } + + fn get_next_id(&mut self) -> Result, Error> { + let seq = try!(self.get_next()); + match seq { + Some(seq) => { + let key = format!("S{}", seq); + match try!(self.snapshot.get(&key.as_bytes())) { + // If there is an id, it's UTF-8 + Some(id) => Ok(Some(id.to_utf8().unwrap().to_string())), + None => Ok(None) + } + }, + None => Ok(None), + } + } } //struct SnapshotIteratorCreator { @@ -122,16 +179,16 @@ pub struct QueryResults { -struct ExactMatchFilter<'a> { - iter: DBIterator<'a>, +struct ExactMatchFilter { + iter: DBIterator, kb: KeyBuilder, stemmed_offset: u64, suffix: String, suffix_offset: u64, } -impl<'a> ExactMatchFilter<'a> { - fn new(iter: DBIterator<'a>, stemmed_word: &StemmedWord, mut kb: KeyBuilder) -> ExactMatchFilter<'a> { +impl ExactMatchFilter { + fn new(iter: DBIterator, stemmed_word: &StemmedWord, mut kb: KeyBuilder) -> ExactMatchFilter { kb.push_word(&stemmed_word.stemmed); ExactMatchFilter{ iter: iter, @@ -143,13 +200,14 @@ impl<'a> ExactMatchFilter<'a> { } } -impl<'a> QueryRuntimeFilter for ExactMatchFilter<'a> { +impl QueryRuntimeFilter for ExactMatchFilter { fn first_result(&mut self, start_id: u64) -> Result, Error> { // Build the full key self.kb.push_doc_seq(start_id); // Seek in index to >= entry - self.iter.seek(SeekKey::from(self.kb.key().as_bytes())); + self.iter.set_mode(IteratorMode::From(self.kb.key().as_bytes(), + rocksdb::Direction::Forward)); // Revert self.kb.pop_doc_seq(); @@ -168,13 +226,15 @@ impl<'a> QueryRuntimeFilter for ExactMatchFilter<'a> { // New scope needed as the iter.next() below invalidates the // current key and value { - let key = 
self.iter.key(); + let (key, value) = match self.iter.next() { + Some((key, value)) => (key, value), + None => return Ok(None), + }; if !key.starts_with(self.kb.key().as_bytes()) { return Ok(None) } let seq = &key[self.kb.key().len()..]; - let value = self.iter.value(); // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed // and why we pass on mutable reference of it to `read_message()` let mut ref_value = &*value; @@ -283,32 +343,30 @@ impl<'a> QueryRuntimeFilter for AndFilter<'a> { fn first_result(&mut self, start_id: u64) -> Result, Error> { let base_result = try!(self.filters[self.current_filter].first_result(start_id)); self.result(base_result) - //Ok(None) } fn next_result(&mut self) -> Result, Error> { - //let base_result = try!(self.filters[self.current_filter].next_result()); - //self.result(base_result) - Ok(None) + let base_result = try!(self.filters[self.current_filter].next_result()); + self.result(base_result) } } struct Parser<'a> { - query: &'a str, + query: String, offset: usize, kb: KeyBuilder, - snapshot: &'a Snapshot<'a>, + snapshot: Snapshot<'a>, } impl<'a> Parser<'a> { - fn new(query: &'a str, snapshot: &'a Snapshot<'a>) -> Parser<'a> { + fn new(query: String, snapshot: Snapshot<'a>) -> Parser<'a> { Parser{ query: query, offset: 0, kb: KeyBuilder::new(), - snapshot: &snapshot, + snapshot: snapshot, } } @@ -376,7 +434,7 @@ impl<'a> Parser<'a> { } } - fn bool(&mut self) -> Result, Error> { + fn bool<'b>(&'b mut self) -> Result, Error> { let left = try!(self.compare()); let mut filters = vec![left]; loop { @@ -395,7 +453,7 @@ impl<'a> Parser<'a> { } - fn array(&mut self) -> Result, Error> { + fn array<'b>(&'b mut self) -> Result, Error> { if !self.consume("[") { return Err(Error::Parse("Expected '['".to_string())); } @@ -408,7 +466,7 @@ impl<'a> Parser<'a> { Ok(filter) } - fn factor(&mut self) -> Result, Error> { + fn factor<'b>(&'b mut self) -> Result, Error> { if self.consume("(") { let filter = try!(self.bool()); if !self.consume(")") { @@ -439,7 +497,7 @@ impl<'a> Parser<'a> { let stems = Stems::new(&literal); let mut filters: Vec> = Vec::new(); for stem in stems { - let iter = self.snapshot.iter(); + let iter = self.snapshot.iterator(IteratorMode::Start); let filter = Box::new(ExactMatchFilter::new( iter, &stem, self.kb.clone())); filters.push(filter); @@ -450,7 +508,7 @@ impl<'a> Parser<'a> { match filters.len() { 0 => panic!("Cannot create a ExactMatchFilter"), 1 => Ok(filters.pop().unwrap()), - _ => Ok(Box::new(AndFilter::<'a>::new( + _ => Ok(Box::new(AndFilter::new( filters, self.kb.array_depth))), } }, @@ -475,31 +533,25 @@ impl<'a> Parser<'a> { } } - fn build_filter(&mut self) -> Result, Error> { + fn build_filter(mut self) -> Result<(Box, Snapshot<'a>), Error> { self.whitespace(); - self.bool() + Ok((self.bool().unwrap(), self.snapshot)) } } impl Query { - //pub fn get_matches<'a>(query: &str, index: &'a Index) -> Result { - pub fn get_matches<'a>(query: &str, snapshot: &'a Snapshot) -> Result { - // match &index.rocks { -// &Some(ref rocks) => { -// let snapshot = Snapshot::new(rocks); -// let parser = Parser::new(query, &snapshot); -// Ok(QueryResults{}) -// }, -// &None => { -// Err("You must open the index first".to_string()) -// }, -// } - //let rocks = &index.rocks.unwrap(); - // This one would work as well - //let rocks = index.rocks.as_ref().unwrap(); - //let snapshot = Snapshot::new(rocks); - let parser = Parser::new(query, &snapshot); - Ok(QueryResults{}) + pub fn get_matches<'a>(query: String, index: &'a Index) -> Result, 
Error> { + match index.rocks { + Some(ref rocks) => { + let snapshot = Snapshot::new(&rocks); + let parser = Parser::new(query, snapshot); + let (filter, snapshot2) = try!(parser.build_filter()); + Ok(QueryResults::new(filter, snapshot2)) + }, + None => { + Err(Error::Parse("You must open the index first".to_string())) + }, + } } } From e79fba381c4c91f5071dbbf2754310e8ae63818d Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 8 Nov 2016 12:21:59 -0800 Subject: [PATCH 008/122] Fix parsing problems with string literals and whitespace --- src/lib.rs | 2 +- src/query.rs | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4adf6aa..22ffd37 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ mod json_shred; mod key_builder; mod stems; pub mod index; -mod query; +pub mod query; // include capnp code generated by `build.rs` mod records_capnp { diff --git a/src/query.rs b/src/query.rs index 989bacf..71f101a 100644 --- a/src/query.rs +++ b/src/query.rs @@ -144,7 +144,7 @@ impl<'a> QueryResults<'a> { } } - fn get_next_id(&mut self) -> Result, Error> { + pub fn get_next_id(&mut self) -> Result, Error> { let seq = try!(self.get_next()); match seq { Some(seq) => { @@ -372,12 +372,14 @@ impl<'a> Parser<'a> { fn whitespace(&mut self) { loop { - if let Some(char) = self.query[self.offset..].chars().next() { - // Stop when the character isn't a whitespace - if !char.is_whitespace() { - break; - } - self.offset += char.len_utf8(); + match self.query[self.offset..].chars().next() { + Some(char) => { + if !char.is_whitespace() { + break; + } + self.offset += char.len_utf8(); + }, + None => break, } } } @@ -418,9 +420,10 @@ impl<'a> Parser<'a> { let mut lit = String::new(); if self.consume("\"") { for char in self.query[self.offset..].chars() { - if char != '"' { - lit.push(char); + if char == '"' { + break; } + lit.push(char); self.offset += char.len_utf8(); } if self.consume("\"") { From 1e1ee454ffa00e7ae76e7764da52db760c7570ab Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 8 Nov 2016 12:30:49 -0800 Subject: [PATCH 009/122] Fix unit tests --- src/key_builder.rs | 8 ++++---- src/query.rs | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/key_builder.rs b/src/key_builder.rs index e3bb184..1c208a5 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -144,7 +144,7 @@ mod tests { assert_eq!(kb.segments.len(), 3, "Three segments "); assert_eq!(kb.key(), "W.first.second$", "Key for three segments is correct"); - kb.push_word("astemmedword".to_string()); + kb.push_word("astemmedword"); assert_eq!(kb.segments.len(), 4, "Four segments"); assert_eq!(kb.key(), "W.first.second$!astemmedword#", "Key for four segments is correct"); @@ -165,7 +165,7 @@ mod tests { #[should_panic(expected = "assertion failed: self.segments.len() > 0")] fn test_segments_push_word_panic() { let mut kb = KeyBuilder::new(); - kb.push_word("astemmedword".to_string()); + kb.push_word("astemmedword"); } #[test] @@ -174,7 +174,7 @@ mod tests { kb.push_object_key("first".to_string()); kb.push_object_key("second".to_string()); kb.push_array(); - kb.push_word("astemmedword".to_string()); + kb.push_word("astemmedword"); kb.push_doc_seq(123); assert_eq!(kb.segments.len(), 5, "Five segments"); assert_eq!(kb.key(), "W.first.second$!astemmedword#123", @@ -218,7 +218,7 @@ mod tests { assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::Array), "Last segment is an array"); - kb.push_word("astemmedword".to_string()); + 
kb.push_word("astemmedword"); assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::Word), "Last segment is a word"); diff --git a/src/query.rs b/src/query.rs index 71f101a..ae776c7 100644 --- a/src/query.rs +++ b/src/query.rs @@ -571,15 +571,16 @@ mod tests { let mut index = Index::new(); index.open("test_whitespace", Some(OpenOptions::Create)).unwrap(); let rocks = &index.rocks.unwrap(); - let snapshot = Snapshot::new(rocks); + let mut snapshot = Snapshot::new(rocks); - let mut query = " \n \t test"; - let mut parser = Parser::new(query, &snapshot); + let mut query = " \n \t test".to_string(); + let mut parser = Parser::new(query, snapshot); parser.whitespace(); assert_eq!(parser.offset, 5); - query = "test"; - parser = Parser::new(query, &snapshot); + snapshot = Snapshot::new(rocks); + query = "test".to_string(); + parser = Parser::new(query, snapshot); parser.whitespace(); assert_eq!(parser.offset, 0); } From 479e9021b86e10286f19cfb61d71afd6fae1dab4 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 8 Nov 2016 13:08:02 -0800 Subject: [PATCH 010/122] Deal with empty objects properly TODO: finish the unit test --- src/json_shred.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 1860076..398b69a 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -82,7 +82,6 @@ impl Shredder { pub fn shred(&mut self, json: &str, docseq: u64) -> Result<&str, String> { - println!("{}", json); let mut parser = Parser::new(json.chars()); let mut token = parser.next(); @@ -95,22 +94,24 @@ impl Shredder { Some(JsonEvent::ObjectStart) => { match parser.stack().top() { Some(StackElement::Key(key)) => { - println!("object start: {:?}", key); self.keybuilder.push_object_key(key.to_string()); self.inc_top_array_offset(); }, - _ => { - panic!("XXX This is probably an object end"); - } + // We won't hit this case as we are within an object and not within + // an array + Some(StackElement::Index(_)) => {} + // It's an empty object + None => { + // Just push something to make `ObjectEnd` happy + self.keybuilder.push_object_key("".to_string()); + }, } }, Some(JsonEvent::ObjectEnd) => { self.keybuilder.pop_object_key(); }, Some(JsonEvent::ArrayStart) => { - println!("array start"); self.keybuilder.push_array(); - //self.inc_top_array_offset(); self.path_array_offsets.push(0); }, Some(JsonEvent::ArrayEnd) => { @@ -120,7 +121,6 @@ impl Shredder { Some(JsonEvent::StringValue(value)) => { self.add_entries(value, docseq); self.inc_top_array_offset(); - //self.keybuilder.pop_object_key(); }, not_implemented => { panic!("Not yet implemented other JSON types! 
{:?}", not_implemented); @@ -255,4 +255,13 @@ mod tests { } } } + + #[test] + fn test_shred_empty_object() { + let mut shredder = super::Shredder::new(); + let json = r#"{}"#; + let docseq = 123; + shredder.shred(json, docseq).unwrap(); + // TODO vmx 2016-11-08: Finish this test + } } From b6aa2a593848530a3a68aa85991dafd9b2520689 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 8 Nov 2016 19:43:41 -0800 Subject: [PATCH 011/122] First query works --- src/index.rs | 2 +- src/json_shred.rs | 141 ++++++++++++++++++++++++++++++++++++--------- src/key_builder.rs | 4 +- src/main.rs | 18 ++++++ src/query.rs | 4 ++ 5 files changed, 139 insertions(+), 30 deletions(-) create mode 100644 src/main.rs diff --git a/src/index.rs b/src/index.rs index 4115a03..e1b7cc1 100644 --- a/src/index.rs +++ b/src/index.rs @@ -107,7 +107,7 @@ impl Index { Ok(ret) } - pub fn add(&mut self, json: &str) -> Result<(), String> { + pub fn add(&mut self, json: &str) -> Result<(), Error> { let mut shredder = Shredder::new(); // NOTE vmx 2016-10-13: Needed for the lifetime-checker, though not sure if it now really // does the right thing. Does the `try!()` still return as epected? diff --git a/src/json_shred.rs b/src/json_shred.rs index 398b69a..828dae3 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -10,6 +10,7 @@ use self::rocksdb::Writable; use key_builder::{KeyBuilder, SegmentType}; use records_capnp::payload; use stems::Stems; +use query::{Error}; // Good example of using rustc_serialize: https://github.com/ajroetker/beautician/blob/master/src/lib.rs // Callback based JSON streaming parser: https://github.com/gyscos/json-streamer.rs @@ -33,11 +34,26 @@ type ArrayOffsets = Vec; type ArrayOffsetsToWordInfo = HashMap>; type WordPathInfoMap = HashMap; + +enum ObjectKeyTypes { + /// _id field + Id, + /// Normal key + Key(String), + /// Reserved key starting with underscore + Ignore, + /// No key found + NoKey, +} + #[derive(Debug)] pub struct Shredder { keybuilder: KeyBuilder, map: WordPathInfoMap, path_array_offsets: ArrayOffsets, + // Top-level fields prefixed with an underscore are ignored + ignore_children: u64, + doc_id: String, } @@ -47,9 +63,11 @@ impl Shredder { keybuilder: KeyBuilder::new(), map: WordPathInfoMap::new(), path_array_offsets: Vec::new(), + ignore_children: 0, + doc_id: String::new(), } } - fn add_entries(&mut self, text: String, docseq: u64) { + fn add_entries(&mut self, text: &String, docseq: u64) { let stems = Stems::new(text.as_str()); for stem in stems { self.keybuilder.push_word(&stem.stemmed); @@ -80,54 +98,123 @@ impl Shredder { } } + // Extract key if it exists and indicates if it's a special type of key + fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { + if self.keybuilder.last_pushed_segment_type().unwrap() == SegmentType::ObjectKey { + match stack_element { + Some(StackElement::Key(key)) => { + if self.keybuilder.segments.len() == 1 && key.starts_with("_") { + if key == "_id" { + ObjectKeyTypes::Id + } else { + ObjectKeyTypes::Ignore + } + } else { + ObjectKeyTypes::Key(key.to_string()) + } + }, + _ => ObjectKeyTypes::NoKey, + } + } else { + ObjectKeyTypes::NoKey + } + } + + // If we are inside an object we need to push the key to the key builder + // Don't push them if they are reserved fields (starting with underscore) + fn maybe_push_key(&mut self, stack_element: Option) -> Result<(), Error> { + if self.keybuilder.last_pushed_segment_type().unwrap() == SegmentType::ObjectKey + && self.keybuilder.segments.len() == 1 { + if let 
Some(StackElement::Key(key)) = stack_element { + if key.starts_with("_") { + if key == "_id" { + return Err(Error::Shred( + "Expected string for `_id` field, got another type".to_string())); + } else { + self.ignore_children = 1; + } + } else { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.keybuilder.pop_object_key(); + self.keybuilder.push_object_key(key.to_string()); + } + } + } + Ok(()) + } - pub fn shred(&mut self, json: &str, docseq: u64) -> Result<&str, String> { + pub fn shred(&mut self, json: &str, docseq: u64) -> Result { let mut parser = Parser::new(json.chars()); let mut token = parser.next(); loop { // Get the next token, so that in case of an `ObjectStart` the key is already // on the stack. - let nexttoken = parser.next(); - match token.take() { Some(JsonEvent::ObjectStart) => { - match parser.stack().top() { - Some(StackElement::Key(key)) => { - self.keybuilder.push_object_key(key.to_string()); - self.inc_top_array_offset(); - }, - // We won't hit this case as we are within an object and not within - // an array - Some(StackElement::Index(_)) => {} - // It's an empty object - None => { - // Just push something to make `ObjectEnd` happy - self.keybuilder.push_object_key("".to_string()); - }, + if self.ignore_children > 0 { + self.ignore_children += 1; + } + else { + // Just push something to make `ObjectEnd` happy + self.keybuilder.push_object_key("".to_string()); } }, Some(JsonEvent::ObjectEnd) => { - self.keybuilder.pop_object_key(); + if self.ignore_children > 0 { + self.ignore_children -= 1; + } else {//if !self.keybuilder.segments.is_empty() { + self.keybuilder.pop_object_key(); + } }, Some(JsonEvent::ArrayStart) => { - self.keybuilder.push_array(); - self.path_array_offsets.push(0); + if self.ignore_children > 0 { + self.ignore_children += 1; + } else { + self.keybuilder.push_array(); + self.path_array_offsets.push(0); + } }, Some(JsonEvent::ArrayEnd) => { - self.path_array_offsets.pop(); - self.keybuilder.pop_array(); + if self.ignore_children > 0 { + self.ignore_children -= 1; + } else { + self.path_array_offsets.pop(); + self.keybuilder.pop_array(); + } }, Some(JsonEvent::StringValue(value)) => { - self.add_entries(value, docseq); - self.inc_top_array_offset(); + // No children to ignore + if self.ignore_children == 0 { + println!("stringvalue: {:?}", value); + match self.extract_key(parser.stack().top()) { + ObjectKeyTypes::Id => self.doc_id = value, + ObjectKeyTypes::Key(key) => { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.keybuilder.pop_object_key(); + self.keybuilder.push_object_key(key.to_string()); + + self.add_entries(&value, docseq); + self.inc_top_array_offset(); + }, + ObjectKeyTypes::NoKey => { + self.add_entries(&value, docseq); + self.inc_top_array_offset(); + }, + ObjectKeyTypes::Ignore => { + self.ignore_children = 1; + }, + } + } }, not_implemented => { panic!("Not yet implemented other JSON types! 
{:?}", not_implemented); } }; - token = nexttoken; + token = parser.next(); if token == None { break; } @@ -137,10 +224,10 @@ impl Shredder { for key in self.map.keys() { println!(" {}", key); } - Ok(&"thedocid") + Ok(self.doc_id.clone()) } - pub fn add_to_batch(&self, batch: &rocksdb::WriteBatch) -> Result<(), String> { + pub fn add_to_batch(&self, batch: &rocksdb::WriteBatch) -> Result<(), Error> { for (key_path, word_path_infos) in &self.map { let mut message = ::capnp::message::Builder::new_default(); { diff --git a/src/key_builder.rs b/src/key_builder.rs index 1c208a5..b628431 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -9,7 +9,7 @@ pub enum SegmentType { } #[derive(Debug, Clone)] -struct Segment { +pub struct Segment { type_: SegmentType, offset: usize, } @@ -17,7 +17,7 @@ struct Segment { #[derive(Debug, Clone)] pub struct KeyBuilder { pub array_depth: usize, - segments: Vec, + pub segments: Vec, fullkey: String, } diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..eecb1de --- /dev/null +++ b/src/main.rs @@ -0,0 +1,18 @@ +extern crate noise; + +use noise::index::{Index, OpenOptions}; +use noise::query::Query; + +fn main() { + let dbname = "querytestdb"; + Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + index.add(r#"{"_id": "foo", "hello": "world"}"#); + index.flush().unwrap(); + + let mut query_results = Query::get_matches(r#"hello="world""#.to_string(), &index).unwrap(); + //let mut query_results = Query::get_matches(r#"a.b[foo="bar"]"#.to_string(), &index).unwrap(); + println!("query results: {:?}", query_results.get_next_id()); +} diff --git a/src/query.rs b/src/query.rs index ae776c7..af81c9b 100644 --- a/src/query.rs +++ b/src/query.rs @@ -16,6 +16,7 @@ use records_capnp::payload; #[derive(Debug)] pub enum Error { Parse(String), + Shred(String), Capnp(capnp::Error), Rocks(rocksdb::Error), } @@ -24,6 +25,7 @@ impl error::Error for Error { fn description(&self) -> &str { match *self { Error::Parse(ref description) => description, + Error::Shred(ref description) => description, Error::Capnp(ref err) => err.description(), // XXX vmx 2016-11-07: It should be fixed on the RocksDB wrapper // that it has the std::error:Error implemented and hence @@ -35,6 +37,7 @@ impl error::Error for Error { fn cause(&self) -> Option<&error::Error> { match *self { Error::Parse(_) => None, + Error::Shred(_) => None, Error::Capnp(ref err) => Some(err as &error::Error), // NOTE vmx 2016-11-07: Looks like the RocksDB Wrapper needs to be // patched to be based on the std::error::Error trait @@ -59,6 +62,7 @@ impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Error::Parse(ref err) => write!(f, "Parse error: {}", err), + Error::Shred(ref err) => write!(f, "Shred error: {}", err), Error::Capnp(ref err) => write!(f, "Capnproto error: {}", err), Error::Rocks(ref err) => write!(f, "RocksDB error: {}", err), } From 48c96c5f4b1c18eb01cf20d1bce2d89ef5dd4eca Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 8 Nov 2016 20:12:11 -0800 Subject: [PATCH 012/122] Use rust-rocksdb fork based on Spacejams one. The plan is to get those changes upstream. 
--- Cargo.toml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 61c5452..374a7c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,10 +17,8 @@ stemmer = "0.3.2" unicode-normalization = "0.1.2" [dependencies.rocksdb] -#git = "https://github.com/vmx/rust-rocksdb.git" -#branch = "vmx" -path = "../spacejam-rocksdb" -#git = "https://github.com/spacejam/rust-rocksdb.git" +git = "https://github.com/vmx/rust-rocksdb.git" +branch = "noise-spacejam" [dependencies.unicode-segmentation] git = "https://github.com/vmx/unicode-segmentation.git" From 8c650af9b2223ecc4ac4279dbc765944233ed6a0 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 8 Nov 2016 21:07:45 -0800 Subject: [PATCH 013/122] Fix tests --- src/json_shred.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 828dae3..4d9284b 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -169,6 +169,7 @@ impl Shredder { } }, Some(JsonEvent::ArrayStart) => { + try!(self.maybe_push_key(parser.stack().top())); if self.ignore_children > 0 { self.ignore_children += 1; } else { @@ -311,18 +312,18 @@ mod tests { (vec![0], vec![ WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }])]), - ("W.A$.B!c2#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }, - WordInfo { - stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }])]), ("W.A$.B!three#1234", vec![ (vec![0], vec![WordInfo { stemmed_offset: 10, suffix_text: "".to_string(), suffix_offset: 15 }])]), ("W.A$.B!two#1234", vec![ (vec![0], vec![WordInfo { stemmed_offset: 6, suffix_text: "".to_string(), suffix_offset: 9 }])]), + ("W.A$.C!c2#1234", vec![ + (vec![0], vec![ + WordInfo { + stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }, + WordInfo { + stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }])]), ]; compare_shredded(&shredder.map, &expected); } From 6348c6f5970dec16924eb498cabc6e562fc04d3b Mon Sep 17 00:00:00 2001 From: Michael Nitschinger Date: Mon, 7 Nov 2016 21:32:11 -0800 Subject: [PATCH 014/122] Rewrite common_prefix_len to use idiomatic iterators. 
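For reference, the rewritten helper from the diff below, extracted into a standalone sketch with a quick check (the example inputs here are illustrative, not taken from the test suite):

```
/// Return the *byte* length of the common prefix between two strings.
/// zip() stops at the end of the shorter input, take_while() stops at the
/// first mismatching pair, and fold() sums the UTF-8 widths of the matches.
fn common_prefix_len(aa: &str, bb: &str) -> usize {
    aa.chars()
        .zip(bb.chars())
        .take_while(|&(a, b)| a == b)
        .fold(0, |acc, (a, _)| acc + a.len_utf8())
}

fn main() {
    assert_eq!(common_prefix_len("stemming", "stemmed"), 5); // "stemm"
    assert_eq!(common_prefix_len("Ünicöde", "Ünicorn"), 5);  // "Ünic": Ü is 2 bytes
    assert_eq!(common_prefix_len("abc", "xyz"), 0);
}
```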
--- src/stems.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/stems.rs b/src/stems.rs index 5946509..c8a0f1b 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -37,14 +37,10 @@ impl<'a> Stems<'a> { /// Return the *byte* length of the common prefix between two strings fn common_prefix_len(aa: &str, bb: &str) -> usize { - let mut count = 0; - for (charsa, charsb) in aa.chars().zip(bb.chars()) { - if charsa != charsb { - break; - } - count += charsa.len_utf8(); - } - count + aa.chars() + .zip(bb.chars()) + .take_while(|&(a, b)| a == b) + .fold(0, |acc, (a, _)| acc + a.len_utf8()) } } From 39f6ec2aafd793066a80a456dba46d55b4697312 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Wed, 9 Nov 2016 10:34:21 -0800 Subject: [PATCH 015/122] Finish simple empty object test --- src/json_shred.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 4d9284b..6268b6f 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -350,6 +350,6 @@ mod tests { let json = r#"{}"#; let docseq = 123; shredder.shred(json, docseq).unwrap(); - // TODO vmx 2016-11-08: Finish this test + assert!(shredder.map.is_empty()); } } From d5b1b1cd6fb234fb5de07ae8e61ef126de9e680a Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Wed, 9 Nov 2016 10:34:48 -0800 Subject: [PATCH 016/122] Fixed typo --- src/json_shred.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 6268b6f..4bcd0a7 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -330,7 +330,7 @@ mod tests { fn compare_shredded(result_map: &WordPathInfoMap, expected: &Vec<(&str, Vec<(Vec, Vec)>)>) { - // HashMap have an arbitraty order of the elements + // HashMap have an arbitrary order of the elements let mut result: Vec<(&String, &ArrayOffsetsToWordInfo)> = result_map.into_iter().collect(); result.sort_by(|a, b| Ord::cmp(&a.0, &b.0)); for (ii, &(key, values)) in result.iter().enumerate() { From 7b5f571a4b86358fd6d3808cf3719bdb02de2ed2 Mon Sep 17 00:00:00 2001 From: Michael Nitschinger Date: Thu, 10 Nov 2016 03:47:22 -0800 Subject: [PATCH 017/122] Keep src dir clean from test data --- src/index.rs | 2 +- src/query.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index.rs b/src/index.rs index e1b7cc1..47c85db 100644 --- a/src/index.rs +++ b/src/index.rs @@ -179,7 +179,7 @@ mod tests { fn test_open() { let mut index = Index::new(); //let db = super::Index::open("firstnoisedb", Option::None).unwrap(); - index.open("firstnoisedb", Some(OpenOptions::Create)).unwrap(); + index.open("target/tests/firstnoisedb", Some(OpenOptions::Create)).unwrap(); index.flush().unwrap(); } } diff --git a/src/query.rs b/src/query.rs index af81c9b..9e1d142 100644 --- a/src/query.rs +++ b/src/query.rs @@ -573,7 +573,7 @@ mod tests { #[test] fn test_whitespace() { let mut index = Index::new(); - index.open("test_whitespace", Some(OpenOptions::Create)).unwrap(); + index.open("target/tests/test_whitespace", Some(OpenOptions::Create)).unwrap(); let rocks = &index.rocks.unwrap(); let mut snapshot = Snapshot::new(rocks); From 6f076e4cac94204517d174f9b6c92b9cbba0bf4a Mon Sep 17 00:00:00 2001 From: Michael Nitschinger Date: Thu, 10 Nov 2016 03:49:16 -0800 Subject: [PATCH 018/122] ignore intellij project files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a9d37c5..0aa91de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ target Cargo.lock + +**/*.iml +.idea/ From 
d08f767a021c0bac37281ada57d0a1dd3fc82d4c Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Wed, 9 Nov 2016 10:43:32 -0800 Subject: [PATCH 019/122] Move Error struct into its own module --- src/index.rs | 3 +-- src/json_shred.rs | 2 +- src/lib.rs | 1 + src/query.rs | 58 ++--------------------------------------------- 4 files changed, 5 insertions(+), 59 deletions(-) diff --git a/src/index.rs b/src/index.rs index 47c85db..eb1765d 100644 --- a/src/index.rs +++ b/src/index.rs @@ -6,9 +6,8 @@ use records_capnp::header; // Needed for a trait in order to `dekete/put()` into a `rocksdb::WriteBatch` use self::rocksdb::Writable; +use error::Error; use json_shred::{Shredder}; -// TODO vmx 2016-11-07: Move errors into their own module -use query::Error; const NOISE_HEADER_VERSION: u64 = 1; diff --git a/src/json_shred.rs b/src/json_shred.rs index 4bcd0a7..93c5755 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -7,10 +7,10 @@ use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; // Needed for a trait in order to `put()` into a `rocksdb::WriteBatch` use self::rocksdb::Writable; +use error::Error; use key_builder::{KeyBuilder, SegmentType}; use records_capnp::payload; use stems::Stems; -use query::{Error}; // Good example of using rustc_serialize: https://github.com/ajroetker/beautician/blob/master/src/lib.rs // Callback based JSON streaming parser: https://github.com/gyscos/json-streamer.rs diff --git a/src/lib.rs b/src/lib.rs index 22ffd37..96d690a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ extern crate capnp; extern crate rocksdb; +mod error; mod json_shred; mod key_builder; mod stems; diff --git a/src/query.rs b/src/query.rs index 9e1d142..c5722c6 100644 --- a/src/query.rs +++ b/src/query.rs @@ -2,8 +2,9 @@ #![allow(unused_variables)] extern crate capnp; -use std::{error, fmt, str}; +use std::str; +use error::Error; use index::Index; use key_builder::KeyBuilder; use stems::{StemmedWord, Stems}; @@ -13,61 +14,6 @@ use rocksdb::{self, DBIterator, IteratorMode, Snapshot}; use records_capnp::payload; -#[derive(Debug)] -pub enum Error { - Parse(String), - Shred(String), - Capnp(capnp::Error), - Rocks(rocksdb::Error), -} - -impl error::Error for Error { - fn description(&self) -> &str { - match *self { - Error::Parse(ref description) => description, - Error::Shred(ref description) => description, - Error::Capnp(ref err) => err.description(), - // XXX vmx 2016-11-07: It should be fixed on the RocksDB wrapper - // that it has the std::error:Error implemented and hence - // and err.description() - Error::Rocks(_) => "This is an rocksdb error", - } - } - - fn cause(&self) -> Option<&error::Error> { - match *self { - Error::Parse(_) => None, - Error::Shred(_) => None, - Error::Capnp(ref err) => Some(err as &error::Error), - // NOTE vmx 2016-11-07: Looks like the RocksDB Wrapper needs to be - // patched to be based on the std::error::Error trait - Error::Rocks(ref err) => None, - } - } -} - -impl From for Error { - fn from(err: capnp::Error) -> Error { - Error::Capnp(err) - } -} - -impl From for Error { - fn from(err: rocksdb::Error) -> Error { - Error::Rocks(err) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - Error::Parse(ref err) => write!(f, "Parse error: {}", err), - Error::Shred(ref err) => write!(f, "Shred error: {}", err), - Error::Capnp(ref err) => write!(f, "Capnproto error: {}", err), - Error::Rocks(ref err) => write!(f, "RocksDB error: {}", err), - } - } -} pub struct DocResult { From 
b790f6119877cfe7048fc3e79b2d85d0de6fbdad Mon Sep 17 00:00:00 2001
From: Volker Mische
Date: Thu, 10 Nov 2016 13:54:55 -0800
Subject: [PATCH 020/122] Forgot to check in the error.rs

---
 src/error.rs | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 src/error.rs

diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..62af691
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,60 @@
+extern crate capnp;
+extern crate rocksdb;
+
+use std::{error, fmt};
+
+#[derive(Debug)]
+pub enum Error {
+    Parse(String),
+    Shred(String),
+    Capnp(capnp::Error),
+    Rocks(rocksdb::Error),
+}
+
+impl error::Error for Error {
+    fn description(&self) -> &str {
+        match *self {
+            Error::Parse(ref description) => description,
+            Error::Shred(ref description) => description,
+            Error::Capnp(ref err) => err.description(),
+            // XXX vmx 2016-11-07: It should be fixed on the RocksDB wrapper
+            // that it has the std::error:Error implemented and hence
+            // and err.description()
+            Error::Rocks(_) => "This is an rocksdb error",
+        }
+    }
+
+    fn cause(&self) -> Option<&error::Error> {
+        match *self {
+            Error::Parse(_) => None,
+            Error::Shred(_) => None,
+            Error::Capnp(ref err) => Some(err as &error::Error),
+            // NOTE vmx 2016-11-07: Looks like the RocksDB Wrapper needs to be
+            // patched to be based on the std::error::Error trait
+            Error::Rocks(_) => None,
+        }
+    }
+}
+
+impl From<capnp::Error> for Error {
+    fn from(err: capnp::Error) -> Error {
+        Error::Capnp(err)
+    }
+}
+
+impl From<rocksdb::Error> for Error {
+    fn from(err: rocksdb::Error) -> Error {
+        Error::Rocks(err)
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            Error::Parse(ref err) => write!(f, "Parse error: {}", err),
+            Error::Shred(ref err) => write!(f, "Shred error: {}", err),
+            Error::Capnp(ref err) => write!(f, "Capnproto error: {}", err),
+            Error::Rocks(ref err) => write!(f, "RocksDB error: {}", err),
+        }
+    }
+}

From 03d5848bc17236df01431325b5278c16e9794dd1 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Thu, 10 Nov 2016 17:19:38 -0800
Subject: [PATCH 021/122] Changes to word stemming

We now return the unstemmable characters in the prefix of a string as
their own stemmed word, and do not lowercase the suffix characters.
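Concretely: for an input like "@!? Let's seeing..." the iterator now emits the leading unstemmable run as a pseudo-word of its own, and suffixes keep their original casing. A sketch of that behavior as a test, assuming it sits in the tests module of src/stems.rs; the expected values mirror the updated tests in the next commit:

```
#[test]
fn test_unstemmable_prefix_and_cased_suffix() {
    let result = Stems::new("@!? Let's seeing...").collect::<Vec<StemmedWord>>();
    // The leading non-alphabetic run "@!? " comes back as its own entry...
    assert_eq!(result[0].stemmed, "@!? ");
    assert_eq!(result[0].suffix, "");
    // ...and the suffix of the following word is no longer lowercased.
    assert_eq!(result[1].stemmed, "let");
    assert_eq!(result[1].suffix, "Let's ");
}
```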
--- src/stems.rs | 83 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 18 deletions(-) diff --git a/src/stems.rs b/src/stems.rs index c8a0f1b..a973ca9 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -2,6 +2,7 @@ extern crate stemmer; extern crate unicode_normalization; extern crate unicode_segmentation; +use std::iter::Peekable; use self::stemmer::Stemmer; use self::unicode_normalization::UnicodeNormalization; @@ -9,7 +10,7 @@ use self::unicode_segmentation::UnicodeSegmentation; pub struct Stems<'a> { - words: unicode_segmentation::UnicodeWordIndices<'a>, + words: Peekable>, stemmer: Stemmer, } @@ -30,7 +31,7 @@ pub struct StemmedWord { impl<'a> Stems<'a> { pub fn new(text: &str) -> Stems { Stems{ - words: text.unicode_word_indices(), + words: text.split_word_bound_indices().peekable(), stemmer: Stemmer::new("english").unwrap(), } } @@ -48,24 +49,70 @@ impl<'a> Iterator for Stems<'a> { type Item = StemmedWord; fn next(&mut self) -> Option { - match self.words.next() { - Some((pos, word)) => { - let lowercased = word.to_lowercase(); - let normalized = lowercased.nfkc().collect::(); - let stemmed = self.stemmer.stem(&normalized); - let prefix_len = Stems::common_prefix_len(&stemmed, &normalized); - let ret = StemmedWord { - stemmed_offset: pos, - suffix_offset: pos + prefix_len, - stemmed: stemmed, - suffix: (&normalized[prefix_len..normalized.len()]).to_string(), - }; - Some(ret) - }, - None => None + let mut word_to_stem = String::new(); + let mut stemmed_offset = 0; + let mut normalized = String::new(); + loop { + match self.words.peek() { + Some(&(pos, word)) => { + normalized = word.nfkc().collect::(); + if word.chars().next().unwrap().is_alphabetic() { + stemmed_offset = pos; + break; + } else { + word_to_stem.push_str(&normalized); + self.words.next(); + } + }, + None => { + if word_to_stem.is_empty() { + return None; + } else { + break; + } + }, + } } - } + if !word_to_stem.is_empty() { + // we found the begining of the string is not a stemmable word. + // Return the accumulated string as the stemmed word + return Some(StemmedWord { + stemmed_offset: 0, + suffix_offset: word_to_stem.len(), + stemmed: word_to_stem, + suffix: String::new(), + }); + } + // normalized contains our stemmable word. advance the iter since we only peeked. 
+ self.words.next(); + word_to_stem = normalized; + let mut suffix = word_to_stem.clone(); + loop { + // loop through all non-alphabetic chars and add to suffix (which means the suffix starts + // before the stemmed word) + match self.words.peek() { + Some(&(_pos, word)) => { + normalized = word.nfkc().collect::(); + if normalized.chars().next().unwrap().is_alphabetic() { + break; + } else { + suffix.push_str(&normalized); + self.words.next(); + } + }, + None => break, + } + } + let stemmed = self.stemmer.stem(&word_to_stem.to_lowercase()); + let prefix_len = Stems::common_prefix_len(&stemmed, &suffix); + Some(StemmedWord { + stemmed_offset: stemmed_offset, + suffix_offset: stemmed_offset + prefix_len, + stemmed: stemmed, + suffix: (&suffix[prefix_len..]).to_string(), + }) + } } From 867964bbb32d1dedd7e384f766983675dc6f2297 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 11 Nov 2016 09:31:05 -0800 Subject: [PATCH 022/122] Fixed broken tests in stems.rs still one broken --- src/stems.rs | 73 +++++++++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 47 deletions(-) diff --git a/src/stems.rs b/src/stems.rs index a973ca9..0bed3a6 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -120,53 +120,26 @@ impl<'a> Iterator for Stems<'a> { mod tests { use super::{StemmedWord, Stems}; - #[test] - fn test_stems_lowercase() { - let input = "These words deeply test smoothly that stemming"; - let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 5, - stemmed: String::from("these"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 6, suffix_offset: 10, - stemmed: String::from("word"), suffix: String::from("s") }, - // "deeply" stems to "deepli" - StemmedWord { stemmed_offset: 12, suffix_offset: 17, - stemmed: String::from("deepli"), suffix: String::from("y") }, - StemmedWord { stemmed_offset: 19, suffix_offset: 23, - stemmed: String::from("test"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 24, suffix_offset: 30, - stemmed: String::from("smooth"), suffix: String::from("ly") }, - StemmedWord { stemmed_offset: 33, suffix_offset: 37, - stemmed: String::from("that"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 38, suffix_offset: 42, - stemmed: String::from("stem"), suffix: String::from("ming") }, - ]; - assert_eq!(result.len(), expected.len()); - for (stem, expected_stem) in result.iter().zip(expected.iter()) { - assert_eq!(stem, expected_stem); - } - } - #[test] fn test_stems_mixedcase() { let input = "THEse Words deeplY test smOOthly that stemmING"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 5, - stemmed: String::from("these"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 6, suffix_offset: 10, - stemmed: String::from("word"), suffix: String::from("s") }, + StemmedWord { stemmed_offset: 0, suffix_offset: 0, + stemmed: String::from("these"), suffix: String::from("THEse ") }, + StemmedWord { stemmed_offset: 6, suffix_offset: 6, + stemmed: String::from("word"), suffix: String::from("Words ") }, // "deeply" stems to "deepli" StemmedWord { stemmed_offset: 12, suffix_offset: 17, - stemmed: String::from("deepli"), suffix: String::from("y") }, + stemmed: String::from("deepli"), suffix: String::from("Y ") }, StemmedWord { stemmed_offset: 19, suffix_offset: 23, - stemmed: String::from("test"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 24, suffix_offset: 30, - stemmed: 
From 867964bbb32d1dedd7e384f766983675dc6f2297 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Fri, 11 Nov 2016 09:31:05 -0800
Subject: [PATCH 022/122] Fixed broken tests in stems.rs; still one broken

---
 src/stems.rs | 73 +++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 47 deletions(-)

diff --git a/src/stems.rs b/src/stems.rs
index a973ca9..0bed3a6 100644
--- a/src/stems.rs
+++ b/src/stems.rs
@@ -120,53 +120,26 @@ impl<'a> Iterator for Stems<'a> {
 mod tests {
     use super::{StemmedWord, Stems};

-    #[test]
-    fn test_stems_lowercase() {
-        let input = "These words deeply test smoothly that stemming";
-        let result = Stems::new(input).collect::<Vec<StemmedWord>>();
-        let expected = vec![
-            StemmedWord { stemmed_offset: 0, suffix_offset: 5,
-                          stemmed: String::from("these"), suffix: String::from("") },
-            StemmedWord { stemmed_offset: 6, suffix_offset: 10,
-                          stemmed: String::from("word"), suffix: String::from("s") },
-            // "deeply" stems to "deepli"
-            StemmedWord { stemmed_offset: 12, suffix_offset: 17,
-                          stemmed: String::from("deepli"), suffix: String::from("y") },
-            StemmedWord { stemmed_offset: 19, suffix_offset: 23,
-                          stemmed: String::from("test"), suffix: String::from("") },
-            StemmedWord { stemmed_offset: 24, suffix_offset: 30,
-                          stemmed: String::from("smooth"), suffix: String::from("ly") },
-            StemmedWord { stemmed_offset: 33, suffix_offset: 37,
-                          stemmed: String::from("that"), suffix: String::from("") },
-            StemmedWord { stemmed_offset: 38, suffix_offset: 42,
-                          stemmed: String::from("stem"), suffix: String::from("ming") },
-        ];
-        assert_eq!(result.len(), expected.len());
-        for (stem, expected_stem) in result.iter().zip(expected.iter()) {
-            assert_eq!(stem, expected_stem);
-        }
-    }
-
     #[test]
     fn test_stems_mixedcase() {
         let input = "THEse Words deeplY test smOOthly that stemmING";
         let result = Stems::new(input).collect::<Vec<StemmedWord>>();
         let expected = vec![
-            StemmedWord { stemmed_offset: 0, suffix_offset: 5,
-                          stemmed: String::from("these"), suffix: String::from("") },
-            StemmedWord { stemmed_offset: 6, suffix_offset: 10,
-                          stemmed: String::from("word"), suffix: String::from("s") },
+            StemmedWord { stemmed_offset: 0, suffix_offset: 0,
+                          stemmed: String::from("these"), suffix: String::from("THEse ") },
+            StemmedWord { stemmed_offset: 6, suffix_offset: 6,
+                          stemmed: String::from("word"), suffix: String::from("Words ") },
             // "deeply" stems to "deepli"
             StemmedWord { stemmed_offset: 12, suffix_offset: 17,
-                          stemmed: String::from("deepli"), suffix: String::from("y") },
+                          stemmed: String::from("deepli"), suffix: String::from("Y ") },
             StemmedWord { stemmed_offset: 19, suffix_offset: 23,
-                          stemmed: String::from("test"), suffix: String::from("") },
-            StemmedWord { stemmed_offset: 24, suffix_offset: 30,
-                          stemmed: String::from("smooth"), suffix: String::from("ly") },
+                          stemmed: String::from("test"), suffix: String::from(" ") },
+            StemmedWord { stemmed_offset: 24, suffix_offset: 26,
+                          stemmed: String::from("smooth"), suffix: String::from("OOthly ") },
             StemmedWord { stemmed_offset: 33, suffix_offset: 37,
-                          stemmed: String::from("that"), suffix: String::from("") },
+                          stemmed: String::from("that"), suffix: String::from(" ") },
             StemmedWord { stemmed_offset: 38, suffix_offset: 42,
-                          stemmed: String::from("stem"), suffix: String::from("ming") },
+                          stemmed: String::from("stem"), suffix: String::from("mING") },
         ];
         assert_eq!(result.len(), expected.len());
         for (stem, expected_stem) in result.iter().zip(expected.iter()) {
@@ -178,7 +151,10 @@ fn test_stems_nonchars() {
         let input = " @#$!== \t+-";
         let result = Stems::new(input).collect::<Vec<StemmedWord>>();
-        assert_eq!(result.len(), 0);
+        assert_eq!(result, vec![
+            StemmedWord { stemmed_offset: 0, suffix_offset: 12,
+                          stemmed: String::from(" @#$!== \t+-"), suffix: String::from("") },
+            ]);
     }

     #[test]
         let input = "@!? Let's seeing...";
         let result = Stems::new(input).collect::<Vec<StemmedWord>>();
         let expected = vec![
-            StemmedWord { stemmed_offset: 6, suffix_offset: 9,
-                          stemmed: String::from("let"), suffix: String::from("'s") },
+            StemmedWord { stemmed_offset: 0, suffix_offset: 6,
+                          stemmed: String::from("@!? "), suffix: String::from("") },
+            StemmedWord { stemmed_offset: 6, suffix_offset: 6,
+                          stemmed: String::from("let"), suffix: String::from("Let's ") },
             StemmedWord { stemmed_offset: 12, suffix_offset: 15,
-                          stemmed: String::from("see"), suffix: String::from("ing") },
+                          stemmed: String::from("see"), suffix: String::from("ing...") },
         ];
         assert_eq!(result.len(), expected.len());
         for (stem, expected_stem) in result.iter().zip(expected.iter()) {
@@ -202,8 +180,8 @@
         let input = "Ünicöde stemming";
         let result = Stems::new(input).collect::<Vec<StemmedWord>>();
         let expected = vec![
-            StemmedWord { stemmed_offset: 0, suffix_offset: 8,
-                          stemmed: String::from("ünicöd"), suffix: String::from("e") },
+            StemmedWord { stemmed_offset: 0, suffix_offset: 0,
+                          stemmed: String::from("ünicöd"), suffix: String::from("Ünicöde ") },
             StemmedWord { stemmed_offset: 10, suffix_offset: 14,
                           stemmed: String::from("stem"), suffix: String::from("ming") },
         ];
@@ -218,8 +196,8 @@
         let input = "İ";
         let result = Stems::new(input).collect::<Vec<StemmedWord>>();
         let expected = vec![
-            StemmedWord { stemmed_offset: 0, suffix_offset: 3,
-                          stemmed: String::from("i̇"), suffix: String::from("") },
+            StemmedWord { stemmed_offset: 0, suffix_offset: 0,
+                          stemmed: String::from("i̇"), suffix: String::from("İ") },
         ];
         assert_eq!(result.len(), expected.len());
         for (stem, expected_stem) in result.iter().zip(expected.iter()) {
@@ -253,8 +231,9 @@
         let input = "\u{03A1}\u{0313}\u{03C1}\u{0313}\u{1FE4}";
         let result = Stems::new(input).collect::<Vec<StemmedWord>>();
         let expected = vec![
-            StemmedWord { stemmed_offset: 0, suffix_offset: 9,
-                          stemmed: String::from("\u{1FE4}\u{1FE4}\u{1FE4}"), suffix: String::from("") },
+            StemmedWord { stemmed_offset: 0, suffix_offset: 0,
+                          stemmed: String::from("\u{1FE4}\u{1FE4}\u{1FE4}"),
+                          suffix: String::from("\u{03A1}\u{0313}\u{03C1}\u{0313}\u{03C1}") },
         ];
         assert_eq!(result.len(), expected.len());
         for (stem, expected_stem) in result.iter().zip(expected.iter()) {
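Aside: the reworked expectations above all follow one rule, namely that the suffix begins where the stemmed form stops sharing a prefix with the case-preserved token. A small sketch of that arithmetic (ASCII-only for brevity; `common_prefix_len` here is a stand-in for the private helper in stems.rs):

    // Stand-in for Stems::common_prefix_len: length of the shared prefix.
    fn common_prefix_len(a: &str, b: &str) -> usize {
        a.chars().zip(b.chars()).take_while(|&(x, y)| x == y).count()
    }

    fn main() {
        // "smOOthly " stems (lowercased) to "smooth"; they share only "sm".
        let (stemmed, token) = ("smooth", "smOOthly ");
        let prefix_len = common_prefix_len(stemmed, token);
        assert_eq!(prefix_len, 2);
        // With stemmed_offset 24 this yields suffix_offset 24 + 2 = 26 and
        // suffix "OOthly ", matching the test_stems_mixedcase expectation.
        assert_eq!(&token[prefix_len..], "OOthly ");
    }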
From 4ad195dd168ac7a54ab0aa47f1631636be4bad89 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Fri, 11 Nov 2016 09:42:26 -0800
Subject: [PATCH 023/122] Got rid of compile warnings of unused results

---
 src/main.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index eecb1de..f4bcf49 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,11 +5,11 @@ use noise::query::Query;

 fn main() {
     let dbname = "querytestdb";
-    Index::delete(dbname);
+    let _ = Index::delete(dbname);

     let mut index = Index::new();
     index.open(dbname, Some(OpenOptions::Create)).unwrap();
-    index.add(r#"{"_id": "foo", "hello": "world"}"#);
+    let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#);
     index.flush().unwrap();

     let mut query_results = Query::get_matches(r#"hello="world""#.to_string(), &index).unwrap();

From 40ff067547266dd624662abbb4b014b446e22dd9 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Fri, 11 Nov 2016 09:58:39 -0800
Subject: [PATCH 024/122] Strip off keyspace byte from get_next_id result
 string

---
 src/query.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/query.rs b/src/query.rs
index c5722c6..df87784 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -100,8 +100,8 @@ impl<'a> QueryResults<'a> {
             Some(seq) => {
                 let key = format!("S{}", seq);
                 match try!(self.snapshot.get(&key.as_bytes())) {
-                    // If there is an id, it's UTF-8
-                    Some(id) => Ok(Some(id.to_utf8().unwrap().to_string())),
+                    // If there is an id, it's UTF-8. Strip off the leading keyspace byte.
+                    Some(id) => Ok(Some(id.to_utf8().unwrap()[1..].to_string())),
                     None => Ok(None)
                 }
             },

From 3b478b76dde43803acf7c451b6b2a40757f7a529 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Fri, 11 Nov 2016 10:17:48 -0800
Subject: [PATCH 025/122] Simplify whitespace method in query parser

---
 src/query.rs | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/src/query.rs b/src/query.rs
index df87784..10e710d 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -321,22 +321,18 @@ impl<'a> Parser<'a> {
     }

     fn whitespace(&mut self) {
-        loop {
-            match self.query[self.offset..].chars().next() {
-                Some(char) => {
-                    if !char.is_whitespace() {
-                        break;
-                    }
-                    self.offset += char.len_utf8();
-                },
-                None => break,
+        for char in self.query[self.offset..].chars() {
+            if !char.is_whitespace() {
+                break;
             }
+            self.offset += char.len_utf8();
         }
     }

     fn consume(&mut self, token: &str) -> bool {
         if self.could_consume(token) {
             self.offset += token.len();
+            self.whitespace();
             true
         } else {
             false

From 1efbe374847d22eba89d52c160490da78d8ddd1b Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Fri, 11 Nov 2016 13:54:13 -0800
Subject: [PATCH 026/122] Added support for escaping special chars in string
 literals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support for \\ \" \n \b \r \v \t \f, which is similar to JavaScript.
---
 src/query.rs | 48 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/src/query.rs b/src/query.rs
index 10e710d..911ec42 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -362,18 +362,40 @@ impl<'a> Parser<'a> {
     }

     fn consume_string_literal(&mut self) -> Result<Option<String>, Error> {
-        // Does not unescape yet
         let mut lit = String::new();
-        if self.consume("\"") {
+        let mut next_is_special_char = false;
+        if self.could_consume("\"") {
+            // can't consume("\"") the leading quote because it will also skip leading whitespace
+            // inside the string literal
+            self.offset += 1;
             for char in self.query[self.offset..].chars() {
+                if next_is_special_char {
+                    match char {
+                        '\\' | '"' => lit.push(char),
+                        'n' => lit.push('\n'),
+                        'b' => lit.push('\x08'),
+                        'r' => lit.push('\r'),
+                        'f' => lit.push('\x0C'),
+
't' => lit.push('\t'), + 'v' => lit.push('\x0B'), + _ => return Err(Error::Parse(format!("Unknown character escape: {}", + char))), + }; + self.offset += 1; + next_is_special_char = false; + } else { + if char == '"' { + break; + } else if char == '\\' { + next_is_special_char = true; + self.offset += 1; + } else { + lit.push(char); + self.offset += char.len_utf8(); + } } - lit.push(char); - self.offset += char.len_utf8(); } if self.consume("\"") { - self.whitespace(); Ok(Some(lit)) } else { Err(Error::Parse("Expected \"".to_string())) @@ -530,4 +552,16 @@ mod tests { parser.whitespace(); assert_eq!(parser.offset, 0); } + + #[test] + fn test_consume_string_literal() { + let mut index = Index::new(); + index.open("target/tests/test_consume_string_literal", Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + let snapshot = Snapshot::new(rocks); + + let query = r#"" \n \t test""#.to_string(); + let mut parser = Parser::new(query, snapshot); + assert_eq!(parser.consume_string_literal().unwrap().unwrap(), " \n \t test".to_string()); + } } From 2d854ff6cbf34fc745d9600966a99bf5f00d41b3 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Fri, 11 Nov 2016 14:28:06 -0800 Subject: [PATCH 027/122] Fix bug when array ends --- src/query.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query.rs b/src/query.rs index 911ec42..627a7ee 100644 --- a/src/query.rs +++ b/src/query.rs @@ -431,7 +431,7 @@ impl<'a> Parser<'a> { self.kb.push_array(); let filter = try!(self.bool()); self.kb.pop_array(); - if self.consume("]") { + if !self.consume("]") { return Err(Error::Parse("Expected ']'".to_string())); } Ok(filter) From 3dd1e3adc8d6a2118005dea14869a8afb52f2a85 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Fri, 11 Nov 2016 16:41:07 -0800 Subject: [PATCH 028/122] Make queries work --- src/json_shred.rs | 5 ++- src/query.rs | 82 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 93c5755..eea3628 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -123,10 +123,9 @@ impl Shredder { // If we are inside an object we need to push the key to the key builder // Don't push them if they are reserved fields (starting with underscore) fn maybe_push_key(&mut self, stack_element: Option) -> Result<(), Error> { - if self.keybuilder.last_pushed_segment_type().unwrap() == SegmentType::ObjectKey - && self.keybuilder.segments.len() == 1 { + if self.keybuilder.last_pushed_segment_type().unwrap() == SegmentType::ObjectKey { if let Some(StackElement::Key(key)) = stack_element { - if key.starts_with("_") { + if self.keybuilder.segments.len() == 1 && key.starts_with("_") { if key == "_id" { return Err(Error::Shred( "Expected string for `_id` field, got another type".to_string())); diff --git a/src/query.rs b/src/query.rs index 627a7ee..7b293cc 100644 --- a/src/query.rs +++ b/src/query.rs @@ -206,7 +206,6 @@ impl QueryRuntimeFilter for ExactMatchFilter { } } } - self.iter.next(); if doc_result.seq > 0 { return Ok(Some(doc_result)); @@ -255,7 +254,7 @@ impl<'a> AndFilter<'a> { base_result.truncate_array_paths(self.array_depth); self.current_filter += 1; - if self.current_filter > self.filters.len() { + if self.current_filter == self.filters.len() { self.current_filter = 0; } @@ -271,6 +270,9 @@ impl<'a> AndFilter<'a> { Some(new_result) => { base_result = new_result; matches_count -= 1; + if matches_count == 0 { + return Ok(Some(base_result)); + } }, None => { let new_result = 
try!(self.filters[self.current_filter].first_result(base_result.seq)); @@ -528,7 +530,7 @@ impl Query { #[cfg(test)] mod tests { - use super::Parser; + use super::{Parser, Query}; use index::{Index, OpenOptions}; @@ -564,4 +566,78 @@ mod tests { let mut parser = Parser::new(query, snapshot); assert_eq!(parser.consume_string_literal().unwrap().unwrap(), " \n \t test".to_string()); } + + #[test] + fn test_query_hello_world() { + let dbname = "target/tests/querytestdbhelloworld"; + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#); + index.flush().unwrap(); + + let mut query_results = Query::get_matches(r#"hello="world""#.to_string(), &index).unwrap(); + //let mut query_results = Query::get_matches(r#"a.b[foo="bar"]"#.to_string(), &index).unwrap(); + println!("query results: {:?}", query_results.get_next_id()); + } + + #[test] + fn test_query_basic() { + let dbname = "target/tests/querytestdbbasic"; + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let _ = index.add(r#"{"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}"#); + let _ = index.add(r#"{"_id":"2", "A":[{"B":"B2","C":[{"D":"D"}]},{"B": "b1","C":"C2"}]}"#); + let _ = index.add(r#"{"_id":"3", "A":"Multi word sentence"}"#); + let _ = index.add(r#"{"_id":"4", "A":"%&%}{}@);€"}"#); + let _ = index.add(r#"{"_id":"5", "A":"{}€52 deeply \\n\\v "}"#); + + index.flush().unwrap(); + + let mut query_results = Query::get_matches(r#"A[B = "B2" & C[ D = "D" ]]"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); + + query_results = Query::get_matches(r#"A[B = "B2" & C = "C2"]"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); + + query_results = Query::get_matches(r#"A[B = "b1" & C = "C2"]"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); + + query_results = Query::get_matches(r#"A = "Multi word sentence""#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); + + query_results = Query::get_matches(r#"A = "%&%}{}@);€""#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("4".to_string())); + + query_results = Query::get_matches(r#"A = "{}€52 deeply \\n\\v ""#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("5".to_string())); + } + + #[test] + fn test_query_more_docs() { + let dbname = "target/tests/querytestdbmoredocs"; + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + for ii in 1..100 { + let data = ((ii % 25) + 97) as u8 as char; + let _ = index.add(&format!(r#"{{"_id":"{}", "data": "{}"}}"#, ii, data)); + } + index.flush().unwrap(); + + let mut query_results = Query::get_matches(r#"data = "u""#.to_string(), &index).unwrap(); + loop { + match query_results.get_next_id() { + Ok(Some(result)) => println!("result: {}", result), + Ok(None) => break, + Err(error) => panic!(error), + } + } + } } From e53d5576c259babc24b522284d348da6d5d239a4 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Fri, 11 Nov 2016 16:56:52 -0800 Subject: [PATCH 029/122] Remove commented out code --- src/query.rs | 23 
+---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/src/query.rs b/src/query.rs index 7b293cc..bf0047c 100644 --- a/src/query.rs +++ b/src/query.rs @@ -55,9 +55,6 @@ impl DocResult { } } -//trait QueryRuntimeFilter { -//struct QueryRuntimeFilter {} - pub trait QueryRuntimeFilter { fn first_result(&mut self, start_id: u64) -> Result, Error>; fn next_result(&mut self) -> Result, Error>; @@ -110,24 +107,6 @@ impl<'a> QueryResults<'a> { } } -//struct SnapshotIteratorCreator { -// snapshot: rocksdb::Snapshot, -//} -// -//impl SnapshotIteratorCreator { -// fn new(db: &rocksdb::DB) { -// let snapshot = rocksdb::Snapshot::new(db); -// SnapshotIteratorCreator{ -// snapshot: snapshot, -// } -// } -// -// fn new_iterator(&self) { -// self.snapshot.iter() -// } -//} - - struct ExactMatchFilter { iter: DBIterator, @@ -368,7 +347,7 @@ impl<'a> Parser<'a> { let mut next_is_special_char = false; if self.could_consume("\"") { // can't consume("\"") the leading quote because it will also skip leading whitespace - // inside the string literal + // inside the string literal self.offset += 1; for char in self.query[self.offset..].chars() { if next_is_special_char { From 456d498d6d38647e38ad66f31a76ee2c19b31783 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Fri, 11 Nov 2016 18:09:53 -0800 Subject: [PATCH 030/122] Tests work again --- src/json_shred.rs | 14 ++++++++++---- src/stems.rs | 4 ++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index eea3628..e416c81 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -310,19 +310,25 @@ mod tests { ("W.A$.B!b2vmx#1234", vec![ (vec![0], vec![ WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }])]), + stemmed_offset: 0, suffix_text: "B2VMX ".to_string(), suffix_offset: 0 }])]), ("W.A$.B!three#1234", vec![ (vec![0], vec![WordInfo { stemmed_offset: 10, suffix_text: "".to_string(), suffix_offset: 15 }])]), ("W.A$.B!two#1234", vec![ (vec![0], vec![WordInfo { - stemmed_offset: 6, suffix_text: "".to_string(), suffix_offset: 9 }])]), + stemmed_offset: 6, suffix_text: " ".to_string(), suffix_offset: 9 }])]), + ("W.A$.C!..#1234", vec![ + (vec![0], vec![ + WordInfo { + stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }, + WordInfo { + stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }])]), ("W.A$.C!c2#1234", vec![ (vec![0], vec![ WordInfo { - stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }, + stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }, WordInfo { - stemmed_offset: 2, suffix_text: "".to_string(), suffix_offset: 4 }])]), + stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }])]), ]; compare_shredded(&shredder.map, &expected); } diff --git a/src/stems.rs b/src/stems.rs index 0bed3a6..9bfe5ea 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -232,8 +232,8 @@ mod tests { let result = Stems::new(input).collect::>(); let expected = vec![ StemmedWord { stemmed_offset: 0, suffix_offset: 0, - stemmed: String::from("\u{1FE4}\u{1FE4}\u{1FE4}"), - suffix: String::from("\u{03A1}\u{0313}\u{03C1}\u{0313}\u{03C1}") }, + stemmed: String::from("\u{03C1}\u{0313}\u{1FE4}\u{1FE4}"), + suffix: String::from("\u{03A1}\u{0313}\u{1FE4}\u{1FE4}") }, ]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { From 98e332342806fb1d37d6e843133c87c2b2c5c370 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Sat, 12 Nov 2016 16:36:14 -0800 Subject: 
[PATCH 031/122] Switch to stock rust-rocksdb

It's no longer necessary to use my fork. The solution to move a field
out of a borrowed struct is to define it as `Option`.
---
 Cargo.toml        |  3 +--
 src/index.rs      | 27 +++++++++++++++++++--------
 src/json_shred.rs |  4 ++--
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 374a7c9..9a6628b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,8 +17,7 @@ stemmer = "0.3.2"
 unicode-normalization = "0.1.2"

 [dependencies.rocksdb]
-git = "https://github.com/vmx/rust-rocksdb.git"
-branch = "noise-spacejam"
+git = "https://github.com/spacejam/rust-rocksdb.git"

 [dependencies.unicode-segmentation]
 git = "https://github.com/vmx/unicode-segmentation.git"
diff --git a/src/index.rs b/src/index.rs
index eb1765d..236ac5a 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -42,7 +42,7 @@ pub struct Index {
     high_doc_seq: u64,
     pub rocks: Option<rocksdb::DB>,
     id_str_to_id_seq: HashMap<String, String>,
-    batch: rocksdb::WriteBatch,
+    batch: Option<rocksdb::WriteBatch>,
 }

 pub enum OpenOptions {
@@ -56,7 +56,7 @@ impl Index {
             high_doc_seq: 0,
             rocks: None,
             id_str_to_id_seq: HashMap::new(),
-            batch: rocksdb::WriteBatch::default(),
+            batch: Some(rocksdb::WriteBatch::default()),
         }
     }
     // NOTE vmx 2016-10-13: Perhaps the name should be specified on `new()` as it is bound
@@ -131,7 +131,7 @@ impl Index {
         // TODO vmx 2016-10-17: Use multiget once the Rust wrapper supports it
             match rocks.get(key.as_bytes()) {
                 Ok(Some(seq)) => {
-                    try!(self.batch.delete(&*seq));
+                    try!(self.batch().delete(&*seq));
                 },
                 _ => {}
             }
@@ -139,16 +139,18 @@ impl Index {

         // Add the ids_to_seq keyspace entries
         for (id, seq) in &self.id_str_to_id_seq {
-            try!(self.batch.put(id.as_bytes(), seq.as_bytes()));
-            try!(self.batch.put(seq.as_bytes(), id.as_bytes()));
+            try!(self.batch().put(id.as_bytes(), seq.as_bytes()));
+            try!(self.batch().put(seq.as_bytes(), id.as_bytes()));
         }

         let mut header = Header::new();
         header.high_seq = self.high_doc_seq;
-        try!(self.batch.put(b"HDB", &*header.serialize()));
+        try!(self.batch().put(b"HDB", &*header.serialize()));

-        let status = try!(rocks.write(&self.batch));
-        self.batch.clear();
+        let status = try!(rocks.write(self.batch.take().unwrap()));
+        // Make sure there's always a valid WriteBatch after writing it into RocksDB,
+        // else calls to `self.batch()` would panic.
+        self.batch = Some(rocksdb::WriteBatch::default());
         self.id_str_to_id_seq.clear();
         Ok(status)
     }
@@ -166,6 +168,15 @@
             None => Ok(None)
         }
     }
+
+    /// Returns the current write batch as a reference
+    ///
+    /// # Panics
+    ///
+    /// Panics if there currently is no `WriteBatch` (`self.batch == None`)
+    fn batch(&self) -> &rocksdb::WriteBatch {
+        self.batch.as_ref().unwrap()
+    }
 }

diff --git a/src/json_shred.rs b/src/json_shred.rs
index e416c81..6d334d1 100644
--- a/src/json_shred.rs
+++ b/src/json_shred.rs
@@ -227,7 +227,7 @@ impl Shredder {
         Ok(self.doc_id.clone())
     }

-    pub fn add_to_batch(&self, batch: &rocksdb::WriteBatch) -> Result<(), Error> {
+    pub fn add_to_batch(&self, batch: &Option<rocksdb::WriteBatch>) -> Result<(), Error> {
         for (key_path, word_path_infos) in &self.map {
             let mut message = ::capnp::message::Builder::new_default();
             {
@@ -257,7 +257,7 @@
             }
             let mut bytes = Vec::new();
             ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap();
-            try!(batch.put(&key_path.clone().into_bytes(), &bytes));
+            try!(batch.as_ref().unwrap().put(&key_path.clone().into_bytes(), &bytes));
         }
         Ok(())
     }
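Aside: the `Option` trick from the commit message, spelled out. A field cannot be moved out of `&mut self` directly, but `Option::take()` swaps a `None` into its place. A minimal sketch with a stand-in type instead of `rocksdb::WriteBatch`:

    // Stand-in for an Index holding a WriteBatch-like value.
    struct Holder {
        batch: Option<Vec<u8>>,
    }

    impl Holder {
        fn flush(&mut self) {
            // take() moves the batch out by value, leaving None behind...
            let batch = self.batch.take().unwrap();
            consume_by_value(batch);
            // ...and the invariant is restored right away, so an accessor
            // like batch() that unwraps the Option cannot panic later.
            self.batch = Some(Vec::new());
        }
    }

    // Stands in for rocks.write(), which takes the batch by value.
    fn consume_by_value(_batch: Vec<u8>) {}

    fn main() {
        let mut holder = Holder { batch: Some(vec![1, 2, 3]) };
        holder.flush();
        assert!(holder.batch.is_some());
    }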
From 6427ecab00eb77c46bfef037603bd0e262b69c47 Mon Sep 17 00:00:00 2001
From: Volker Mische
Date: Sat, 12 Nov 2016 16:47:19 -0800
Subject: [PATCH 032/122] Stock unicode-segmentation can be used

There's no longer any need to use my fork of `unicode-segmentation`.
---
 Cargo.toml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9a6628b..e82aa4a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,14 +15,11 @@ capnp = "0.7.4"
 rustc-serialize= "0.3.19"
 stemmer = "0.3.2"
 unicode-normalization = "0.1.2"
+unicode-segmentation = "0.1.2"

 [dependencies.rocksdb]
 git = "https://github.com/spacejam/rust-rocksdb.git"

-[dependencies.unicode-segmentation]
-git = "https://github.com/vmx/unicode-segmentation.git"
-branch = "unicode-word-indices"
-
 [build-dependencies]
 capnpc = "0.7.2"

From ba56cfea65b798eb19d8b0682f3df8e2b472f2c4 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Wed, 16 Nov 2016 19:37:55 -0800
Subject: [PATCH 033/122] Added peg parser grammar to a comment

This helps document the structure of the recursive descent parser.
---
 src/query.rs | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/query.rs b/src/query.rs
index bf0047c..08f0137 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -386,6 +386,26 @@ impl<'a> Parser<'a> {
         }
     }

+/*
+This is a PEG grammar that documents how the calls of the recursive descent
+parser are implemented. It can be checked here: http://pegjs.org/online
+
+bool
+    = ws compare ws ('&' ws compare)*
+compare
+    = field ('.' field)* ws '=' ws string* / factor
+factor
+    = '(' ws bool ws ')' ws / array
+array
+    = '[' ws bool ']' ws
+field
+    = [a-z]i+ ws
+string
+    = '"' ('\\\\' / '\\' [\"tfvrnb] / [^\\\"])* '"' ws
+ws
+    = [ /\t/\r\n]*
+*/
+
     fn bool<'b>(&'b mut self) -> Result<Box<QueryRuntimeFilter + 'a>, Error> {
         let left = try!(self.compare());
         let mut filters = vec![left];

From 6058be2e902c0897aeed5022254b5cdff2fe3cb8 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Tue, 29 Nov 2016 13:21:37 -0800
Subject: [PATCH 034/122] Split up word index keys to be one key per nested
 repeated field

This fixes #7. We delimit the array path entries with commas.
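Aside: under this layout a word index key decomposes into key path, '#', doc seq, and the comma-delimited array path. A standalone sketch of that decomposition (`split_key` is a hypothetical name; the patch's own version below is `split_keypath_seq_arraypath_from_key`, which additionally must honor escaped '#' and ',' inside the key path):

    // Split "W.some$$!nest#123,2,1" into ("W.some$$!nest", 123, [2, 1]).
    fn split_key(key: &str) -> (&str, u64, Vec<u64>) {
        let hash = key.rfind('#').expect("no doc seq separator");
        let tail = &key[hash + 1..];
        let comma = tail.find(',').expect("no array path separator");
        let seq = tail[..comma].parse().unwrap();
        let arraypath = tail[comma + 1..]
            .split(',')
            .filter(|s| !s.is_empty()) // an empty array path is a lone ','
            .map(|s| s.parse().unwrap())
            .collect();
        (&key[..hash], seq, arraypath)
    }

    fn main() {
        let (path, seq, arraypath) = split_key("W.some$$!nest#123,2,1");
        assert_eq!((path, seq), ("W.some$$!nest", 123));
        assert_eq!(arraypath, vec![2, 1]);
    }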
--- capnp/records.capnp | 16 ++--- src/index.rs | 21 ++++++- src/json_shred.rs | 111 ++++++++++++++-------------------- src/key_builder.rs | 114 ++++++++++++++++++++++++++++++++++- src/query.rs | 142 +++++++++++++++++--------------------------- 5 files changed, 237 insertions(+), 167 deletions(-) diff --git a/capnp/records.capnp b/capnp/records.capnp index caaa5f1..6cbed40 100644 --- a/capnp/records.capnp +++ b/capnp/records.capnp @@ -6,16 +6,10 @@ struct Header { } struct Payload { - struct ArrayoffsetsToWordinfo { - struct Wordinfo { - stemmedOffset @0 :UInt64; - suffixOffset @1 :UInt64; - suffixText @2 :Text; - } - - arrayoffsets @0 :List(UInt64); - wordinfos @1 :List(Wordinfo); + struct Wordinfo { + stemmedOffset @0 :UInt64; + suffixOffset @1 :UInt64; + suffixText @2 :Text; } - - arrayoffsetsToWordinfos @0 :List(ArrayoffsetsToWordinfo); + wordinfos @0 :List(Wordinfo); } diff --git a/src/index.rs b/src/index.rs index 236ac5a..2f29270 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,6 +1,7 @@ extern crate rocksdb; use std::collections::HashMap; +use std::str; use records_capnp::header; // Needed for a trait in order to `dekete/put()` into a `rocksdb::WriteBatch` @@ -75,6 +76,7 @@ impl Index { } rocks_options.create_if_missing(true); + rocks_options.set_comparator("noise", Index::compare_keys); let rocks = try!(rocksdb::DB::open(&rocks_options, name)); @@ -111,11 +113,10 @@ impl Index { // NOTE vmx 2016-10-13: Needed for the lifetime-checker, though not sure if it now really // does the right thing. Does the `try!()` still return as epected? { - let docid = try!(shredder.shred(json, self.high_doc_seq + 1)); + let docid = try!(shredder.shred(json, self.high_doc_seq + 1, &self.batch())); self.high_doc_seq += 1; self.id_str_to_id_seq.insert(format!("I{}", docid), format!("S{}", self.high_doc_seq)); } - try!(shredder.add_to_batch(&self.batch)); Ok(()) } @@ -177,6 +178,22 @@ impl Index { fn batch(&self) -> &rocksdb::WriteBatch { self.batch.as_ref().unwrap() } + + fn compare_keys(a: &[u8], b: &[u8]) -> i32 { + use std::cmp::Ordering; + use key_builder::KeyBuilder; + if a[0] == 'W' as u8 && b[0] == 'W' as u8 { + let astr = unsafe {str::from_utf8_unchecked(&a)}; + let bstr = unsafe {str::from_utf8_unchecked(&b)}; + KeyBuilder::compare_keys(astr, bstr) + } else { + match a.cmp(b) { + Ordering::Less => -1, + Ordering::Greater => 1, + Ordering::Equal => 0, + } + } + } } diff --git a/src/json_shred.rs b/src/json_shred.rs index 6d334d1..2f1af53 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -31,9 +31,6 @@ struct WordInfo { } type ArrayOffsets = Vec; -type ArrayOffsetsToWordInfo = HashMap>; -type WordPathInfoMap = HashMap; - enum ObjectKeyTypes { /// _id field @@ -49,7 +46,6 @@ enum ObjectKeyTypes { #[derive(Debug)] pub struct Shredder { keybuilder: KeyBuilder, - map: WordPathInfoMap, path_array_offsets: ArrayOffsets, // Top-level fields prefixed with an underscore are ignored ignore_children: u64, @@ -61,32 +57,52 @@ impl Shredder { pub fn new() -> Shredder { Shredder{ keybuilder: KeyBuilder::new(), - map: WordPathInfoMap::new(), path_array_offsets: Vec::new(), ignore_children: 0, doc_id: String::new(), } } - fn add_entries(&mut self, text: &String, docseq: u64) { + + fn add_entries(&mut self, text: &String, docseq: u64, batch: &rocksdb::WriteBatch) -> Result<(), Error> { let stems = Stems::new(text.as_str()); + let mut word_to_word_infos = HashMap::new(); + for stem in stems { - self.keybuilder.push_word(&stem.stemmed); - self.keybuilder.push_doc_seq(docseq); - let map_path_array_offsets 
= self.map.entry(self.keybuilder.key()) - .or_insert(ArrayOffsetsToWordInfo::new()); - let map_word_infos = map_path_array_offsets.entry(self.path_array_offsets.clone()) - .or_insert(Vec::new()); - map_word_infos.push(WordInfo{ + let word_infos = word_to_word_infos.entry(stem.stemmed).or_insert(Vec::new()); + word_infos.push(WordInfo{ stemmed_offset: stem.stemmed_offset as u64, suffix_text: stem.suffix.to_string(), suffix_offset: stem.suffix_offset as u64, }); + } + for (stemmed, word_infos) in word_to_word_infos { + let mut message = ::capnp::message::Builder::new_default(); + { + let capn_payload = message.init_root::(); + let mut capn_wordinfos = capn_payload.init_wordinfos(word_infos.len() as u32); + for (pos, word_info) in word_infos.iter().enumerate() { + let mut capn_wordinfo = capn_wordinfos.borrow().get(pos as u32); + capn_wordinfo.set_stemmed_offset(word_info.stemmed_offset); + capn_wordinfo.set_suffix_text(&word_info.suffix_text); + capn_wordinfo.set_suffix_offset(word_info.suffix_offset); + } + } + self.keybuilder.push_word(&stemmed); + self.keybuilder.push_doc_seq(docseq); + self.keybuilder.push_array_path(&self.path_array_offsets); + + let mut bytes = Vec::new(); + ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap(); + try!(batch.put(&self.keybuilder.key().into_bytes(), &bytes)); + + self.keybuilder.pop_array_path(); self.keybuilder.pop_doc_seq(); self.keybuilder.pop_word(); } - println!("add_entries: map: {:?}", self.map); + Ok(()) } + fn inc_top_array_offset(&mut self) { // we encounter a new element. if we are a child element of an array // increment the offset. If we aren't (we are the root value or a map @@ -143,7 +159,7 @@ impl Shredder { Ok(()) } - pub fn shred(&mut self, json: &str, docseq: u64) -> Result { + pub fn shred(&mut self, json: &str, docseq: u64, batch: &rocksdb::WriteBatch) -> Result { let mut parser = Parser::new(json.chars()); let mut token = parser.next(); @@ -165,6 +181,7 @@ impl Shredder { self.ignore_children -= 1; } else {//if !self.keybuilder.segments.is_empty() { self.keybuilder.pop_object_key(); + self.inc_top_array_offset(); } }, Some(JsonEvent::ArrayStart) => { @@ -182,6 +199,7 @@ impl Shredder { } else { self.path_array_offsets.pop(); self.keybuilder.pop_array(); + self.inc_top_array_offset(); } }, Some(JsonEvent::StringValue(value)) => { @@ -196,11 +214,11 @@ impl Shredder { self.keybuilder.pop_object_key(); self.keybuilder.push_object_key(key.to_string()); - self.add_entries(&value, docseq); + try!(self.add_entries(&value, docseq, &batch)); self.inc_top_array_offset(); }, ObjectKeyTypes::NoKey => { - self.add_entries(&value, docseq); + try!(self.add_entries(&value, docseq, &batch)); self.inc_top_array_offset(); }, ObjectKeyTypes::Ignore => { @@ -219,56 +237,18 @@ impl Shredder { break; } } - println!("keybuilder: {}", self.keybuilder.key()); - println!("shredder: keys:"); - for key in self.map.keys() { - println!(" {}", key); - } Ok(self.doc_id.clone()) - } - - pub fn add_to_batch(&self, batch: &Option) -> Result<(), Error> { - for (key_path, word_path_infos) in &self.map { - let mut message = ::capnp::message::Builder::new_default(); - { - let capn_payload = message.init_root::(); - let mut capn_arrayoffsets_to_wordinfo = capn_payload.init_arrayoffsets_to_wordinfos( - word_path_infos.len() as u32); - for (infos_pos, (arrayoffsets, wordinfos)) in word_path_infos.iter().enumerate() { - - let mut capn_a2w = capn_arrayoffsets_to_wordinfo.borrow().get(infos_pos as u32); - { - let mut capn_arrayoffsets = - 
capn_a2w.borrow().init_arrayoffsets(arrayoffsets.len() as u32); - for (pos, arrayoffset) in arrayoffsets.iter().enumerate() { - capn_arrayoffsets.set(pos as u32, arrayoffset.clone()); - } - } - { - let mut capn_wordinfos = capn_a2w.init_wordinfos(wordinfos.len() as u32); - for (pos, wordinfo) in wordinfos.iter().enumerate() { - let mut capn_wordinfo = capn_wordinfos.borrow().get(pos as u32); - capn_wordinfo.set_stemmed_offset(wordinfo.stemmed_offset); - capn_wordinfo.set_suffix_text(&wordinfo.suffix_text); - capn_wordinfo.set_suffix_offset(wordinfo.suffix_offset); - } - } - } - } - let mut bytes = Vec::new(); - ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap(); - try!(batch.as_ref().unwrap().put(&key_path.clone().into_bytes(), &bytes)); - } - Ok(()) - } + } } +/* #[cfg(test)] mod tests { - use super::{ArrayOffsetsToWordInfo, WordInfo, WordPathInfoMap}; - + + use super::{WordInfo}; #[test] + fn test_shred_nested() { let mut shredder = super::Shredder::new(); //let json = r#"{"hello": {"my": "world!"}, "anumber": 2}"#; @@ -280,16 +260,16 @@ mod tests { let docseq = 123; shredder.shred(json, docseq).unwrap(); let expected = vec![ - ("W.some$!array#123", vec![ + ("W.some$!array#123,0", vec![ (vec![0], vec![WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }])]), - ("W.some$!data#123", vec![ + ("W.some$!data#123,1", vec![ (vec![1], vec![WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }])]), - ("W.some$$!also#123", vec![ + ("W.some$$!also#123,2,0", vec![ (vec![2, 0], vec![WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }])]), - ("W.some$$!nest#123", vec![ + ("W.some$$!nest#1232,1", vec![ (vec![2, 1], vec![WordInfo { stemmed_offset: 0, suffix_text: "ed".to_string(), suffix_offset: 4 }])]), ]; @@ -356,5 +336,6 @@ mod tests { let docseq = 123; shredder.shred(json, docseq).unwrap(); assert!(shredder.map.is_empty()); - } + } } +*/ diff --git a/src/key_builder.rs b/src/key_builder.rs index b628431..8c01473 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -1,3 +1,6 @@ +use query::DocResult; +use std::str; + //#[derive(PartialEq, Eq)] #[derive(Debug, Clone, PartialEq)] pub enum SegmentType { @@ -6,6 +9,7 @@ pub enum SegmentType { Array, Word, DocSeq, + ArrayPath, } #[derive(Debug, Clone)] @@ -75,16 +79,29 @@ impl KeyBuilder { self.segments.push(Segment{ type_: SegmentType::Word, offset: self.fullkey.len() }); self.fullkey.push('!'); self.fullkey += stemmed_word; - self.fullkey.push('#'); } pub fn push_doc_seq(&mut self, seq: u64) { debug_assert!(self.segments.len() > 0); debug_assert!(self.segments.last().unwrap().type_ == SegmentType::Word); self.segments.push(Segment{ type_: SegmentType::DocSeq, offset: self.fullkey.len() }); + self.fullkey.push('#'); self.fullkey.push_str(seq.to_string().as_str()); } + pub fn push_array_path(&mut self, path: &Vec) { + debug_assert!(self.segments.len() > 0); + debug_assert!(self.segments.last().unwrap().type_ == SegmentType::DocSeq); + self.segments.push(Segment{ type_: SegmentType::ArrayPath, offset: self.fullkey.len() }); + if path.is_empty() { + self.fullkey.push(','); + } + for i in path { + self.fullkey.push(','); + self.fullkey.push_str(i.to_string().as_str()); + } + } + pub fn pop_object_key(&mut self) { debug_assert!(self.segments.last().unwrap().type_ == SegmentType::ObjectKey); self.fullkey.truncate(self.segments.last().unwrap().offset); @@ -110,15 +127,77 @@ impl KeyBuilder { self.segments.pop(); } + pub fn pop_array_path(&mut self) { + 
debug_assert!(self.segments.last().unwrap().type_ == SegmentType::ArrayPath); + self.fullkey.truncate(self.segments.last().unwrap().offset); + self.segments.pop(); + } + pub fn last_pushed_segment_type(&self) -> Option { self.segments.last().and_then(|segment| Some(segment.type_.clone())) } + + /* splits key into key path parts parsed seq strs + ex "W.foo$.bar$.baz!word#123,0,0" -> ("W.foo$.bar$.bar!word", 123, "0,0") */ + fn split_keypath_seq_arraypath_from_key(str: &str) -> (&str, &str, &str) { + let n = str.rfind("#").unwrap(); + assert!(n != 0); + assert!(n != str.len() - 1); + let seq_array_path_str = &str[(n + 1)..]; + let m = seq_array_path_str.find(",").unwrap(); + + (&str[..n], &seq_array_path_str[..m], &seq_array_path_str[m + 1..]) + } + + /* parses a seq and array path portion (ex "123,0,0,10) of a key into a doc result */ + pub fn parse_doc_result_from_key(str: &str) -> DocResult { + + let mut dr = DocResult::new(); + let (_path_str, seq_str, array_path_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&str); + dr.seq = seq_str.parse().unwrap(); + if !array_path_str.is_empty() { + for numstr in array_path_str.split(",") { + dr.array_path.push(numstr.parse().unwrap()); + } + } + dr + } + + pub fn compare_keys(akey: &str, bkey: &str) -> i32 { + use std::cmp::Ordering; + assert!(akey.starts_with('W')); + assert!(bkey.starts_with('W')); + let (apath_str, aseq_str, aarray_path_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); + let (bpath_str, bseq_str, barray_path_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); + + match apath_str[1..].cmp(&bpath_str[1..]) { + Ordering::Less => -1, + Ordering::Greater => 1, + Ordering::Equal => { + let aseq: u64 = aseq_str.parse().unwrap(); + let bseq: u64 = bseq_str.parse().unwrap();; + if aseq < bseq { + -1 + } else if aseq > bseq { + 1 + } else { + match aarray_path_str.cmp(barray_path_str) { + Ordering::Less => -1, + Ordering::Greater => 1, + Ordering::Equal => 0, + } + } + }, + } + } + } #[cfg(test)] mod tests { use super::{KeyBuilder, SegmentType}; + use query::DocResult; #[test] fn test_new_key_builder() { @@ -146,7 +225,7 @@ mod tests { kb.push_word("astemmedword"); assert_eq!(kb.segments.len(), 4, "Four segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#", "Key for four segments is correct"); + assert_eq!(kb.key(), "W.first.second$!astemmedword", "Key for four segments is correct"); kb.push_doc_seq(123); assert_eq!(kb.segments.len(), 5, "Five segments"); @@ -176,13 +255,20 @@ mod tests { kb.push_array(); kb.push_word("astemmedword"); kb.push_doc_seq(123); + kb.push_array_path(&vec![0]); + + assert_eq!(kb.segments.len(), 6, "six segments"); + assert_eq!(kb.key(), "W.first.second$!astemmedword#123,0", + "Key for six segments is correct"); + + kb.pop_array_path(); assert_eq!(kb.segments.len(), 5, "Five segments"); assert_eq!(kb.key(), "W.first.second$!astemmedword#123", "Key for five segments is correct"); kb.pop_doc_seq(); assert_eq!(kb.segments.len(), 4, "Four segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#", "Key for four segments is correct"); + assert_eq!(kb.key(), "W.first.second$!astemmedword", "Key for four segments is correct"); kb.pop_word(); assert_eq!(kb.segments.len(), 3, "Three segments "); @@ -226,4 +312,26 @@ mod tests { assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::DocSeq), "Last segment is a doc sequence"); } + + #[test] + fn test_doc_result_parse() { + let key = "W.foo$.bar$!word#123,1,0".to_string(); + let (keypathstr, seqstr, arraypathstr) = 
KeyBuilder::split_keypath_seq_arraypath_from_key(&key); + assert_eq!(keypathstr, "W.foo$.bar$!word"); + assert_eq!(seqstr, "123"); + assert_eq!(arraypathstr, "1,0"); + + // make sure escaped commas and # in key path don't cause problems + let key1 = "W.foo\\#$.bar\\,$!word#123,2,0".to_string(); + let (keypathstr1, seqstr1, arraypathstr1) = KeyBuilder::split_keypath_seq_arraypath_from_key(&key1); + assert_eq!(keypathstr1, "W.foo\\#$.bar\\,$!word"); + assert_eq!(seqstr1, "123"); + assert_eq!(arraypathstr1, "2,0"); + + let mut dr = DocResult::new(); + dr.seq = 123; + dr.array_path = vec![1,0]; + + assert!(dr == KeyBuilder::parse_doc_result_from_key(&key)); + } } diff --git a/src/query.rs b/src/query.rs index 08f0137..ff1aba9 100644 --- a/src/query.rs +++ b/src/query.rs @@ -15,48 +15,31 @@ use records_capnp::payload; - pub struct DocResult { - seq: u64, - array_paths: Vec>, + pub seq: u64, + pub array_path: Vec, } impl DocResult { - fn new() -> DocResult { + pub fn new() -> DocResult { DocResult { seq: 0, - array_paths: Vec::new(), + array_path: Vec::new(), } } +} - fn truncate_array_paths(&mut self, array_depth: usize) { - for array_path in self.array_paths.iter_mut() { - debug_assert!(array_path.len() >= array_depth); - array_path.resize(array_depth, 0); - } - } - fn intersect_array_paths(aa: &DocResult, bb: &DocResult) -> Option { - let mut doc_result = DocResult::new(); - debug_assert_eq!(aa.seq, bb.seq); - doc_result.seq = aa.seq; - for array_path_a in &aa.array_paths { - for array_path_b in &bb.array_paths { - if array_path_a == array_path_b { - doc_result.array_paths.push(array_path_a.clone()); - } - } - } - if doc_result.array_paths.is_empty() { - None - } else { - Some(doc_result) - } +impl PartialEq for DocResult { + fn eq(&self, other: &DocResult) -> bool { + self.seq == other.seq && self.array_path == other.array_path } } +impl Eq for DocResult {} + pub trait QueryRuntimeFilter { - fn first_result(&mut self, start_id: u64) -> Result, Error>; + fn first_result(&mut self, start: &DocResult) -> Result, Error>; fn next_result(&mut self) -> Result, Error>; } @@ -64,7 +47,7 @@ pub struct Query {} pub struct QueryResults<'a> { filter: Box, - first_has_been_called: bool, + doc_result_next: DocResult, snapshot: Snapshot<'a>, } @@ -72,21 +55,18 @@ impl<'a> QueryResults<'a> { fn new(filter: Box, snapshot: Snapshot<'a>) -> QueryResults<'a> { QueryResults{ filter: filter, - first_has_been_called: false, + doc_result_next: DocResult::new(), snapshot: snapshot, } } fn get_next(&mut self) -> Result, Error> { - let doc_result; - if self.first_has_been_called { - doc_result = try!(self.filter.next_result()); - } else { - self.first_has_been_called = true; - doc_result = try!(self.filter.first_result(0)); - } - match doc_result { - Some(doc_result) => Ok(Some(doc_result.seq)), + let result = try!(self.filter.first_result(&self.doc_result_next)); + match result { + Some(doc_result) => { + self.doc_result_next.seq = doc_result.seq + 1; + Ok(Some(doc_result.seq)) + }, None => Ok(None), } } @@ -130,23 +110,23 @@ impl ExactMatchFilter { } impl QueryRuntimeFilter for ExactMatchFilter { - fn first_result(&mut self, start_id: u64) -> Result, Error> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { // Build the full key - self.kb.push_doc_seq(start_id); + self.kb.push_doc_seq(start.seq); + self.kb.push_array_path(&start.array_path); // Seek in index to >= entry self.iter.set_mode(IteratorMode::From(self.kb.key().as_bytes(), rocksdb::Direction::Forward)); // Revert + 
self.kb.pop_array_path(); self.kb.pop_doc_seq(); self.next_result() } fn next_result(&mut self) -> Result, Error> { - let mut doc_result = DocResult::new(); - loop { if !self.iter.valid() { return Ok(None) @@ -160,9 +140,9 @@ impl QueryRuntimeFilter for ExactMatchFilter { None => return Ok(None), }; if !key.starts_with(self.kb.key().as_bytes()) { + // we passed the key paths we are interested in. nothing left to do */ return Ok(None) } - let seq = &key[self.kb.key().len()..]; // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed // and why we pass on mutable reference of it to `read_message()` @@ -171,24 +151,16 @@ impl QueryRuntimeFilter for ExactMatchFilter { &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); let payload = message_reader.get_root::().unwrap(); - for aos_wis in try!(payload.get_arrayoffsets_to_wordinfos()).iter() { - for wi in try!(aos_wis.get_wordinfos()).iter() { - if self.stemmed_offset == wi.get_stemmed_offset() && - self.suffix_offset == wi.get_suffix_offset() && - self.suffix == try!(wi.get_suffix_text()) { - // We have a candidate document to return - let arrayoffsets = try!(aos_wis.get_arrayoffsets()); - doc_result.array_paths.push(arrayoffsets.iter().collect::<>()); - doc_result.seq = str::from_utf8(&seq).unwrap().parse().unwrap(); - break; - } + for wi in try!(payload.get_wordinfos()).iter() { + if self.stemmed_offset == wi.get_stemmed_offset() && + self.suffix_offset == wi.get_suffix_offset() && + self.suffix == try!(wi.get_suffix_text()) { + // We have a candidate document to return + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + return Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))); } } } - - if doc_result.seq > 0 { - return Ok(Some(doc_result)); - } } } } @@ -198,7 +170,7 @@ impl QueryRuntimeFilter for ExactMatchFilter { struct DummyFilter {} impl QueryRuntimeFilter for DummyFilter { - fn first_result(&mut self, start_id: u64) -> Result, Error> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { Ok(None) } fn next_result(&mut self) -> Result, Error> { @@ -223,56 +195,43 @@ impl<'a> AndFilter<'a> { } fn result(&mut self, base: Option) -> Result, Error> { - let mut matches_count = self.filters.len(); + let mut matches_count = self.filters.len() - 1; // TODO vmx 2016-11-04: Make it nicer let mut base_result = match base { Some(base_result) => base_result, None => return Ok(None), }; - loop { - base_result.truncate_array_paths(self.array_depth); + base_result.array_path.resize(self.array_depth, 0); + loop { self.current_filter += 1; if self.current_filter == self.filters.len() { self.current_filter = 0; } - let next = try!(self.filters[self.current_filter].first_result(base_result.seq)); + let next = try!(self.filters[self.current_filter].first_result(&base_result)); let mut next_result = match next { Some(next_result) => next_result, None => return Ok(None), }; - next_result.truncate_array_paths(self.array_depth); - - if base_result.seq == next_result.seq { - match DocResult::intersect_array_paths(&base_result, &next_result) { - Some(new_result) => { - base_result = new_result; - matches_count -= 1; - if matches_count == 0 { - return Ok(Some(base_result)); - } - }, - None => { - let new_result = try!(self.filters[self.current_filter].first_result(base_result.seq)); - base_result = match new_result { - Some(base_result) => base_result, - None => return Ok(None), - }; - matches_count = self.filters.len() - } + next_result.array_path.resize(self.array_depth, 0); + + if base_result == 
next_result {
+                matches_count -= 1;
+                if matches_count == 0 {
+                    return Ok(Some(base_result));
+                }
             } else {
                 base_result = next_result;
-                matches_count = self.filters.len();
+                matches_count = self.filters.len() - 1;
             }
         }
     }
 }

 impl<'a> QueryRuntimeFilter for AndFilter<'a> {
-    fn first_result(&mut self, start_id: u64) -> Result<Option<DocResult>, Error> {
-        let base_result = try!(self.filters[self.current_filter].first_result(start_id));
+    fn first_result(&mut self, start: &DocResult) -> Result<Option<DocResult>, Error> {
+        let base_result = try!(self.filters[self.current_filter].first_result(start));
         self.result(base_result)
     }
@@ -598,22 +557,33 @@ mod tests {

         let mut query_results = Query::get_matches(r#"A[B = "B2" & C[ D = "D" ]]"#.to_string(), &index).unwrap();
         assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), None);

         query_results = Query::get_matches(r#"A[B = "B2" & C = "C2"]"#.to_string(), &index).unwrap();
         assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), None);

         query_results = Query::get_matches(r#"A[B = "b1" & C = "C2"]"#.to_string(), &index).unwrap();
         assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string()));
         assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), None);

         query_results = Query::get_matches(r#"A = "Multi word sentence""#.to_string(), &index).unwrap();
         assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), None);

         query_results = Query::get_matches(r#"A = "%&%}{}@);€""#.to_string(), &index).unwrap();
         assert_eq!(query_results.get_next_id().unwrap(), Some("4".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), None);

         query_results = Query::get_matches(r#"A = "{}€52 deeply \\n\\v ""#.to_string(), &index).unwrap();
         assert_eq!(query_results.get_next_id().unwrap(), Some("5".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), None);
+
+        query_results = Query::get_matches(r#"A[C = "C2"]"#.to_string(), &index).unwrap();
+        assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string()));
+        assert_eq!(query_results.get_next_id().unwrap(), None);
     }

From 2929706e071e6d5339559ec72f710976a26fb190 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Thu, 1 Dec 2016 13:21:47 -0800
Subject: [PATCH 035/122] Fixed comment

---
 src/key_builder.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/key_builder.rs b/src/key_builder.rs
index 8c01473..66c4d15 100644
--- a/src/key_builder.rs
+++ b/src/key_builder.rs
@@ -137,8 +137,8 @@ impl KeyBuilder {
         self.segments.last().and_then(|segment| Some(segment.type_.clone()))
     }

-    /* splits key into key path parts parsed seq strs
-    ex "W.foo$.bar$.baz!word#123,0,0" -> ("W.foo$.bar$.bar!word", 123, "0,0") */
+    /* splits key into key path, seq and array path
+    ex "W.foo$.bar$.baz!word#123,0,0" -> ("W.foo$.bar$.bar!word", "123", "0,0") */
     fn split_keypath_seq_arraypath_from_key(str: &str) -> (&str, &str, &str) {

From c779e02449d7db6f7a099cdfcdd1e23a441217d0 Mon Sep 17 00:00:00 2001
From: Volker Mische
Date: Tue, 6 Dec 2016 16:35:26 +0100
Subject: [PATCH 036/122] Fix JSON shredder tests

One test is ignored as it isn't supposed to work. The tokenizer doesn't
work as expected. It's left there as a reminder to fix this.
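Aside on the ignored test: under UAX #29 word segmentation, which `split_word_bound_indices` implements, no boundary falls between letters and digits, so a mixed token like "B2VMX" stays one word. A quick check against the stock unicode-segmentation crate:

    extern crate unicode_segmentation;
    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let bounds: Vec<&str> = "B2VMX two".split_word_bounds().collect();
        // The digit does not split the token, which is why a test expecting
        // "numbers within words" to be handled differently must be ignored.
        assert_eq!(bounds, vec!["B2VMX", " ", "two"]);
    }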
--- src/json_shred.rs | 150 ++++++++++++++++++++++++---------------------- 1 file changed, 80 insertions(+), 70 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 2f1af53..43f103e 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -241,92 +241,98 @@ impl Shredder { } } -/* #[cfg(test)] mod tests { - + extern crate rocksdb; + use std::str; + use records_capnp; use super::{WordInfo}; + + fn wordinfos_from_rocks(rocks: rocksdb::DB) -> Vec<(String, Vec)> { + let mut result = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + let mut ref_value = &*value; + let message_reader = ::capnp::serialize_packed::read_message( + &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + let payload = message_reader.get_root::().unwrap(); + + let mut wordinfos = Vec::new(); + for wi in payload.get_wordinfos().unwrap().iter() { + wordinfos.push(WordInfo{ + stemmed_offset: wi.get_stemmed_offset(), + suffix_text: wi.get_suffix_text().unwrap().to_string(), + suffix_offset: wi.get_suffix_offset(), + }); + } + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + result.push((key_string, wordinfos)); + } + result + } + + #[test] - fn test_shred_nested() { let mut shredder = super::Shredder::new(); - //let json = r#"{"hello": {"my": "world!"}, "anumber": 2}"#; - //let json = r#"{"A":[{"B":"B2VMX two three","C":"C2"},{"B": "b1","C":"C2"}]}"#; - //let json = r#"{"A":[[[{"B": "string within deeply nested array should be stemmed"}]]]}"#; - //let json = r#"[{"A": 1, "B": 2, "C": 3}]"#; - //let json = r#"{"foo": {"bar": 1}}"#; let json = r#"{"some": ["array", "data", ["also", "nested"]]}"#; let docseq = 123; - shredder.shred(json, docseq).unwrap(); + let batch = rocksdb::WriteBatch::default(); + shredder.shred(json, docseq, &batch).unwrap(); + + let rocks = rocksdb::DB::open_default("target/tests/test_shred_netsted").unwrap(); + rocks.write(batch).unwrap(); + let result = wordinfos_from_rocks(rocks); + let expected = vec![ - ("W.some$!array#123,0", vec![ - (vec![0], vec![WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }])]), - ("W.some$!data#123,1", vec![ - (vec![1], vec![WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }])]), - ("W.some$$!also#123,2,0", vec![ - (vec![2, 0], vec![WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }])]), - ("W.some$$!nest#1232,1", vec![ - (vec![2, 1], vec![WordInfo { - stemmed_offset: 0, suffix_text: "ed".to_string(), suffix_offset: 4 }])]), + ("W.some$!array#123,0".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }]), + ("W.some$!data#123,1".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), + ("W.some$$!also#123,2,0".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), + ("W.some$$!nest#123,2,1".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "ed".to_string(), suffix_offset: 4 }]), ]; - compare_shredded(&shredder.map, &expected); + assert_eq!(result, expected); } #[test] + // NOTE vmx 2016-12-06: This test is intentionally made to fail (hence ignored) as the current + // current tokenizer does the wrong thing when it comes to numbers within words. 
It's left + // here as a reminder to fix that + #[ignore] fn test_shred_objects() { let mut shredder = super::Shredder::new(); let json = r#"{"A":[{"B":"B2VMX two three","C":"..C2"},{"B": "b1","C":"..C2"}]}"#; let docseq = 1234; - shredder.shred(json, docseq).unwrap(); + let batch = rocksdb::WriteBatch::default(); + shredder.shred(json, docseq, &batch).unwrap(); + + let rocks = rocksdb::DB::open_default("target/tests/test_shred_objects").unwrap(); + rocks.write(batch).unwrap(); + let result = wordinfos_from_rocks(rocks); + println!("result: {:?}", result); let expected = vec![ - ("W.A$.B!b1#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }])]), - ("W.A$.B!b2vmx#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 0, suffix_text: "B2VMX ".to_string(), suffix_offset: 0 }])]), - ("W.A$.B!three#1234", vec![ - (vec![0], vec![WordInfo { - stemmed_offset: 10, suffix_text: "".to_string(), suffix_offset: 15 }])]), - ("W.A$.B!two#1234", vec![ - (vec![0], vec![WordInfo { - stemmed_offset: 6, suffix_text: " ".to_string(), suffix_offset: 9 }])]), - ("W.A$.C!..#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }, - WordInfo { - stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }])]), - ("W.A$.C!c2#1234", vec![ - (vec![0], vec![ - WordInfo { - stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }, - WordInfo { - stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }])]), + ("W.A$.B!b1#1234,1".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), + ("W.A$.B!b2vmx#1234,0".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "B2 VMX ".to_string(), + suffix_offset: 0 }]), + ("W.A$.B!three#1234,0".to_string(), vec![ + WordInfo { stemmed_offset: 10, suffix_text: "".to_string(), suffix_offset: 15 }]), + ("W.A$.B!two#1234,0".to_string(), vec![ + WordInfo { stemmed_offset: 6, suffix_text: " ".to_string(), suffix_offset: 9 }]), + ("W.A$.C!..#1234,0".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), + ("W.A$.C!..#1234,1".to_string(), vec![ + WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), + ("W.A$.C!c2#1234,0".to_string(), vec![ + WordInfo { stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), + ("W.A$.C!c2#1234,1".to_string(), vec![ + WordInfo { stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), ]; - compare_shredded(&shredder.map, &expected); - } - - fn compare_shredded(result_map: &WordPathInfoMap, - expected: &Vec<(&str, Vec<(Vec, Vec)>)>) { - // HashMap have an arbitrary order of the elements - let mut result: Vec<(&String, &ArrayOffsetsToWordInfo)> = result_map.into_iter().collect(); - result.sort_by(|a, b| Ord::cmp(&a.0, &b.0)); - for (ii, &(key, values)) in result.iter().enumerate() { - assert_eq!(key, expected[ii].0); - let mut wordinfos: Vec<(&Vec, &Vec)> = values.iter().collect(); - wordinfos.sort_by_key(|item| item.0); - for (jj, wordinfo) in wordinfos.iter().enumerate() { - assert_eq!(wordinfo.0, &expected[ii].1[jj].0); - assert_eq!(wordinfo.1, &expected[ii].1[jj].1); - } - } + assert_eq!(result, expected); } #[test] @@ -334,8 +340,12 @@ mod tests { let mut shredder = super::Shredder::new(); let json = r#"{}"#; let docseq = 123; - shredder.shred(json, docseq).unwrap(); - assert!(shredder.map.is_empty()); - } + let batch = 
rocksdb::WriteBatch::default(); + shredder.shred(json, docseq, &batch).unwrap(); + + let rocks = rocksdb::DB::open_default("target/tests/test_shred_empty_object").unwrap(); + rocks.write(batch).unwrap(); + let result = wordinfos_from_rocks(rocks); + assert!(result.is_empty()); + } } -*/ From 4c6f19effeff0b74c0020ad8fa3a8779ebf3fb7a Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 7 Dec 2016 11:16:51 -0800 Subject: [PATCH 037/122] Store non-inverted document bodies Store the document bodies of values into the database, necessary for fast retrieval and aggregations. --- src/json_shred.rs | 74 +++++------ src/key_builder.rs | 312 +++++++++++++++++---------------------------- src/query.rs | 42 +++--- 3 files changed, 169 insertions(+), 259 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 2f1af53..08fca91 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -8,7 +8,7 @@ use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; use self::rocksdb::Writable; use error::Error; -use key_builder::{KeyBuilder, SegmentType}; +use key_builder::KeyBuilder; use records_capnp::payload; use stems::Stems; @@ -43,10 +43,14 @@ enum ObjectKeyTypes { NoKey, } +pub trait Indexable { + +} + + #[derive(Debug)] pub struct Shredder { - keybuilder: KeyBuilder, - path_array_offsets: ArrayOffsets, + kb: KeyBuilder, // Top-level fields prefixed with an underscore are ignored ignore_children: u64, doc_id: String, @@ -56,8 +60,7 @@ pub struct Shredder { impl Shredder { pub fn new() -> Shredder { Shredder{ - keybuilder: KeyBuilder::new(), - path_array_offsets: Vec::new(), + kb: KeyBuilder::new(), ignore_children: 0, doc_id: String::new(), } @@ -87,39 +90,25 @@ impl Shredder { capn_wordinfo.set_suffix_offset(word_info.suffix_offset); } } - self.keybuilder.push_word(&stemmed); - self.keybuilder.push_doc_seq(docseq); - self.keybuilder.push_array_path(&self.path_array_offsets); let mut bytes = Vec::new(); ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap(); - try!(batch.put(&self.keybuilder.key().into_bytes(), &bytes)); + let key = self.kb.stemmed_word_key(&stemmed, docseq); + try!(batch.put(&key.into_bytes(), &bytes)); - self.keybuilder.pop_array_path(); - self.keybuilder.pop_doc_seq(); - self.keybuilder.pop_word(); } - Ok(()) - } + let key = self.kb.value_key(docseq); + try!(batch.put(&key.into_bytes(), &text.as_bytes())); - - fn inc_top_array_offset(&mut self) { - // we encounter a new element. if we are a child element of an array - // increment the offset. 
If we aren't (we are the root value or a map - // value) we don't increment - if let Some(SegmentType::Array) = self.keybuilder.last_pushed_segment_type() { - if let Some(last) = self.path_array_offsets.last_mut() { - *last += 1; - } - } + Ok(()) } // Extract key if it exists and indicates if it's a special type of key fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { - if self.keybuilder.last_pushed_segment_type().unwrap() == SegmentType::ObjectKey { + if self.kb.last_pushed_keypath_is_object_key() { match stack_element { Some(StackElement::Key(key)) => { - if self.keybuilder.segments.len() == 1 && key.starts_with("_") { + if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { if key == "_id" { ObjectKeyTypes::Id } else { @@ -139,9 +128,9 @@ impl Shredder { // If we are inside an object we need to push the key to the key builder // Don't push them if they are reserved fields (starting with underscore) fn maybe_push_key(&mut self, stack_element: Option) -> Result<(), Error> { - if self.keybuilder.last_pushed_segment_type().unwrap() == SegmentType::ObjectKey { + if self.kb.last_pushed_keypath_is_object_key() { if let Some(StackElement::Key(key)) = stack_element { - if self.keybuilder.segments.len() == 1 && key.starts_with("_") { + if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { if key == "_id" { return Err(Error::Shred( "Expected string for `_id` field, got another type".to_string())); @@ -151,8 +140,8 @@ impl Shredder { } else { // Pop the dummy object that makes ObjectEnd happy // or the previous object key - self.keybuilder.pop_object_key(); - self.keybuilder.push_object_key(key.to_string()); + self.kb.pop_object_key(); + self.kb.push_object_key(key); } } } @@ -162,7 +151,6 @@ impl Shredder { pub fn shred(&mut self, json: &str, docseq: u64, batch: &rocksdb::WriteBatch) -> Result { let mut parser = Parser::new(json.chars()); let mut token = parser.next(); - loop { // Get the next token, so that in case of an `ObjectStart` the key is already // on the stack. 
@@ -173,15 +161,15 @@ impl Shredder { } else { // Just push something to make `ObjectEnd` happy - self.keybuilder.push_object_key("".to_string()); + self.kb.push_object_key(""); } }, Some(JsonEvent::ObjectEnd) => { if self.ignore_children > 0 { self.ignore_children -= 1; - } else {//if !self.keybuilder.segments.is_empty() { - self.keybuilder.pop_object_key(); - self.inc_top_array_offset(); + } else { + self.kb.pop_object_key(); + self.kb.inc_top_array_offset(); } }, Some(JsonEvent::ArrayStart) => { @@ -189,37 +177,33 @@ impl Shredder { if self.ignore_children > 0 { self.ignore_children += 1; } else { - self.keybuilder.push_array(); - self.path_array_offsets.push(0); + self.kb.push_array(); } }, Some(JsonEvent::ArrayEnd) => { if self.ignore_children > 0 { self.ignore_children -= 1; } else { - self.path_array_offsets.pop(); - self.keybuilder.pop_array(); - self.inc_top_array_offset(); + self.kb.pop_array(); + self.kb.inc_top_array_offset(); } }, Some(JsonEvent::StringValue(value)) => { // No children to ignore if self.ignore_children == 0 { - println!("stringvalue: {:?}", value); match self.extract_key(parser.stack().top()) { ObjectKeyTypes::Id => self.doc_id = value, ObjectKeyTypes::Key(key) => { // Pop the dummy object that makes ObjectEnd happy // or the previous object key - self.keybuilder.pop_object_key(); - self.keybuilder.push_object_key(key.to_string()); + self.kb.pop_object_key(); + self.kb.push_object_key(&key);; try!(self.add_entries(&value, docseq, &batch)); - self.inc_top_array_offset(); }, ObjectKeyTypes::NoKey => { try!(self.add_entries(&value, docseq, &batch)); - self.inc_top_array_offset(); + self.kb.inc_top_array_offset(); }, ObjectKeyTypes::Ignore => { self.ignore_children = 1; diff --git a/src/key_builder.rs b/src/key_builder.rs index 66c4d15..c79a125 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -1,140 +1,122 @@ use query::DocResult; use std::str; -//#[derive(PartialEq, Eq)] -#[derive(Debug, Clone, PartialEq)] -pub enum SegmentType { - // BuildState is really simple state tracker to prevent misuse of api - ObjectKey, - Array, - Word, - DocSeq, - ArrayPath, -} - -#[derive(Debug, Clone)] -pub struct Segment { - type_: SegmentType, - offset: usize, -} #[derive(Debug, Clone)] pub struct KeyBuilder { - pub array_depth: usize, - pub segments: Vec, - fullkey: String, + keypath: Vec, + arraypath: Vec, } - impl KeyBuilder { pub fn new() -> KeyBuilder { - let mut kb = KeyBuilder{ - array_depth: 0, - // Magic reserve numbers that are completely arbitrary - segments: Vec::with_capacity(10), - fullkey: String::with_capacity(100), - }; - // First char is keyspace identifier. W means Word keyspace - kb.fullkey.push('W'); - return kb; + KeyBuilder{ + // Magic reserve number is completely arbitrary + keypath: Vec::with_capacity(10), + arraypath: Vec::with_capacity(10), + } } - // NOTE vmx 2016-10-28: This one is just a port of the C++ prototype, but not yet needed here - //fn segments_count(&self) -> usize { - // self.segments.len() - //} + /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath + /// built up internally. + pub fn stemmed_word_key(&self, word: &str, seq: u64) -> String { + self.stemmed_word_key_internal(word, seq, &self.arraypath) + } - pub fn key(&self) -> String { - self.fullkey.clone() + /// Builds a stemmed word key for the input word and doc result, using the key_path built up + /// internally but ignoring the internal array path. 
Instead uses the array path from the + /// DocResult + pub fn stemmed_word_key_from_doc_result(&self, word: &str, dr: &DocResult) -> String { + self.stemmed_word_key_internal(word, dr.seq, &dr.arraypath) } - pub fn push_object_key(&mut self, key: String) { - debug_assert!(self.segments.len() == 0 || - self.segments.last().unwrap().type_ == SegmentType::ObjectKey || - self.segments.last().unwrap().type_ == SegmentType::Array); - self.segments.push(Segment{ type_: SegmentType::ObjectKey, offset: self.fullkey.len() }); - self.fullkey.push('.'); - for cc in key.chars() { - // Escape chars that conflict with delimiters - if "\\$.!#".contains(cc) { - self.fullkey.push('\\'); - } - self.fullkey.push(cc); + fn stemmed_word_key_internal(&self, word: &str, seq: u64, arraypath: &Vec) -> String { + let mut string = String::with_capacity(100); + string.push('W'); + for segment in &self.keypath { + string.push_str(&segment); } - } + string.push('!'); + string.push_str(word); + string.push('#'); + string.push_str(seq.to_string().as_str()); - pub fn push_array(&mut self) { - debug_assert!(self.segments.len() == 0 || - self.segments.last().unwrap().type_ == SegmentType::ObjectKey || - self.segments.last().unwrap().type_ == SegmentType::Array); - self.segments.push(Segment{ type_: SegmentType::Array, offset: self.fullkey.len() }); - self.fullkey.push('$'); - self.array_depth += 1; + self.add_arraypath(&mut string, &arraypath); + string } - pub fn push_word(&mut self, stemmed_word: &str) { - debug_assert!(self.segments.len() > 0); - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::ObjectKey || - self.segments.last().unwrap().type_ == SegmentType::Array); - self.segments.push(Segment{ type_: SegmentType::Word, offset: self.fullkey.len() }); - self.fullkey.push('!'); - self.fullkey += stemmed_word; - } + /// Builds a value key for seq (value keys are the original json terminal value with + /// keyed on keypath and arraypath built up internally). 
+ pub fn value_key(&self, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push('V'); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('#'); + string.push_str(&seq.to_string()); - pub fn push_doc_seq(&mut self, seq: u64) { - debug_assert!(self.segments.len() > 0); - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::Word); - self.segments.push(Segment{ type_: SegmentType::DocSeq, offset: self.fullkey.len() }); - self.fullkey.push('#'); - self.fullkey.push_str(seq.to_string().as_str()); + self.add_arraypath(&mut string, &self.arraypath); + string } - pub fn push_array_path(&mut self, path: &Vec) { - debug_assert!(self.segments.len() > 0); - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::DocSeq); - self.segments.push(Segment{ type_: SegmentType::ArrayPath, offset: self.fullkey.len() }); - if path.is_empty() { - self.fullkey.push(','); + fn add_arraypath(&self, string: &mut String, arraypath: &Vec) { + if arraypath.is_empty() { + string.push(','); + } else { + for i in arraypath { + string.push(','); + string.push_str(i.to_string().as_str()); + } } - for i in path { - self.fullkey.push(','); - self.fullkey.push_str(i.to_string().as_str()); + + } + + pub fn push_object_key(&mut self, key: &str) { + let mut escaped_key = String::with_capacity((key.len() * 2) + 1); // max expansion + escaped_key.push('.'); + for cc in key.chars() { + // Escape chars that conflict with delimiters + if "\\$.!#,".contains(cc) { + escaped_key.push('\\'); + } + escaped_key.push(cc); } + self.keypath.push(escaped_key); + } + + pub fn push_array(&mut self) { + self.keypath.push("$".to_string()); + self.arraypath.push(0); } pub fn pop_object_key(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::ObjectKey); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.segments.pop(); + debug_assert!(self.keypath.last().unwrap().starts_with(".")); + self.keypath.pop(); } pub fn pop_array(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::Array); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.array_depth -= 1; - self.segments.pop(); + debug_assert!(self.keypath.last().unwrap() == "$"); + self.arraypath.pop(); + self.keypath.pop(); } - pub fn pop_word(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::Word); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.segments.pop(); + pub fn inc_top_array_offset(&mut self) { + if self.keypath.len() > 0 && self.keypath.last().unwrap() == "$" { + *self.arraypath.last_mut().unwrap() += 1; + } } - pub fn pop_doc_seq(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::DocSeq); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.segments.pop(); + pub fn arraypath_len(&self) -> usize { + self.arraypath.len() } - pub fn pop_array_path(&mut self) { - debug_assert!(self.segments.last().unwrap().type_ == SegmentType::ArrayPath); - self.fullkey.truncate(self.segments.last().unwrap().offset); - self.segments.pop(); + pub fn last_pushed_keypath_is_object_key(&self) -> bool { + self.keypath.last().unwrap().starts_with(".") } - pub fn last_pushed_segment_type(&self) -> Option { - self.segments.last().and_then(|segment| Some(segment.type_.clone())) + pub fn keypath_segments_len(&self) -> usize { + self.keypath.len() } /* splits key into key path, seq and array path @@ -143,21 +125,27 @@ impl KeyBuilder { let n = 
str.rfind("#").unwrap(); assert!(n != 0); assert!(n != str.len() - 1); - let seq_array_path_str = &str[(n + 1)..]; - let m = seq_array_path_str.find(",").unwrap(); + let seq_arraypath_str = &str[(n + 1)..]; + let m = seq_arraypath_str.find(",").unwrap(); - (&str[..n], &seq_array_path_str[..m], &seq_array_path_str[m + 1..]) + (&str[..n], &seq_arraypath_str[..m], &seq_arraypath_str[m + 1..]) + } + + pub fn get_keypathword_only(&self, stemmed: &str) -> String { + let mut key = self.stemmed_word_key(stemmed, 0); + let n = key.rfind("#").unwrap(); + key.truncate(n + 1); + key } /* parses a seq and array path portion (ex "123,0,0,10) of a key into a doc result */ pub fn parse_doc_result_from_key(str: &str) -> DocResult { - let mut dr = DocResult::new(); - let (_path_str, seq_str, array_path_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&str); + let (_path_str, seq_str, arraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&str); dr.seq = seq_str.parse().unwrap(); - if !array_path_str.is_empty() { - for numstr in array_path_str.split(",") { - dr.array_path.push(numstr.parse().unwrap()); + if !arraypath_str.is_empty() { + for numstr in arraypath_str.split(",") { + dr.arraypath.push(numstr.parse().unwrap()); } } dr @@ -167,8 +155,8 @@ impl KeyBuilder { use std::cmp::Ordering; assert!(akey.starts_with('W')); assert!(bkey.starts_with('W')); - let (apath_str, aseq_str, aarray_path_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); - let (bpath_str, bseq_str, barray_path_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); + let (apath_str, aseq_str, aarraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); + let (bpath_str, bseq_str, barraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); match apath_str[1..].cmp(&bpath_str[1..]) { Ordering::Less => -1, @@ -181,7 +169,7 @@ impl KeyBuilder { } else if aseq > bseq { 1 } else { - match aarray_path_str.cmp(barray_path_str) { + match aarraypath_str.cmp(barraypath_str) { Ordering::Less => -1, Ordering::Greater => 1, Ordering::Equal => 0, @@ -196,121 +184,61 @@ impl KeyBuilder { #[cfg(test)] mod tests { - use super::{KeyBuilder, SegmentType}; + use super::{KeyBuilder}; use query::DocResult; - #[test] - fn test_new_key_builder() { - let kb = KeyBuilder::new(); - assert_eq!(kb.key(), "W", "Initial value is set"); - } #[test] fn test_segments_push() { let mut kb = KeyBuilder::new(); - assert_eq!(kb.segments.len(), 0, "No segments so far"); - assert_eq!(kb.key(), "W", "Key for segments is correct"); + assert_eq!(kb.keypath_segments_len(), 0, "No segments so far"); - kb.push_object_key("first".to_string()); - assert_eq!(kb.segments.len(), 1, "One segment"); - assert_eq!(kb.key(), "W.first", "Key for one segments is correct"); + kb.push_object_key("first"); + assert_eq!(kb.keypath_segments_len(), 1, "One segment"); - kb.push_object_key("second".to_string()); - assert_eq!(kb.segments.len(), 2, "Two segments"); - assert_eq!(kb.key(), "W.first.second", "Key for two segments is correct"); + kb.push_object_key("second"); + assert_eq!(kb.keypath_segments_len(), 2, "Two segments"); kb.push_array(); - assert_eq!(kb.segments.len(), 3, "Three segments "); - assert_eq!(kb.key(), "W.first.second$", "Key for three segments is correct"); - - kb.push_word("astemmedword"); - assert_eq!(kb.segments.len(), 4, "Four segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword", "Key for four segments is correct"); - - kb.push_doc_seq(123); - assert_eq!(kb.segments.len(), 5, "Five segments"); - 
assert_eq!(kb.key(), "W.first.second$!astemmedword#123", - "Key for five segments is correct"); - } - - #[test] - #[should_panic(expected = "assertion failed: self.segments.len() > 0")] - fn test_segments_push_doc_seq_panic() { - let mut kb = KeyBuilder::new(); - kb.push_doc_seq(456); - } - - #[test] - #[should_panic(expected = "assertion failed: self.segments.len() > 0")] - fn test_segments_push_word_panic() { - let mut kb = KeyBuilder::new(); - kb.push_word("astemmedword"); + assert_eq!(kb.keypath_segments_len(), 3, "Three segments "); } #[test] fn test_segments_pop() { let mut kb = KeyBuilder::new(); - kb.push_object_key("first".to_string()); - kb.push_object_key("second".to_string()); + kb.push_object_key("first"); + kb.push_object_key("second"); kb.push_array(); - kb.push_word("astemmedword"); - kb.push_doc_seq(123); - kb.push_array_path(&vec![0]); - assert_eq!(kb.segments.len(), 6, "six segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#123,0", + assert_eq!(kb.keypath_segments_len(), 3, "three segments"); + assert_eq!(kb.stemmed_word_key("astemmedword", 123), "W.first.second$!astemmedword#123,0", "Key for six segments is correct"); - kb.pop_array_path(); - assert_eq!(kb.segments.len(), 5, "Five segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword#123", - "Key for five segments is correct"); - - kb.pop_doc_seq(); - assert_eq!(kb.segments.len(), 4, "Four segments"); - assert_eq!(kb.key(), "W.first.second$!astemmedword", "Key for four segments is correct"); - - kb.pop_word(); - assert_eq!(kb.segments.len(), 3, "Three segments "); - assert_eq!(kb.key(), "W.first.second$", "Key for three segments is correct"); kb.pop_array(); - assert_eq!(kb.segments.len(), 2, "Two segments"); - assert_eq!(kb.key(), "W.first.second", "Key for two segments is correct"); + assert_eq!(kb.keypath_segments_len(), 2, "Two segments"); kb.pop_object_key(); - assert_eq!(kb.segments.len(), 1, "One segment"); - assert_eq!(kb.key(), "W.first", "Key for one segments is correct"); + assert_eq!(kb.keypath_segments_len(), 1, "One segment"); kb.pop_object_key(); - assert_eq!(kb.segments.len(), 0, "No segments so far"); - assert_eq!(kb.key(), "W", "Key for segments is correct"); + assert_eq!(kb.keypath_segments_len(), 0, "No segments so far"); } #[test] fn test_last_pushed_segment_type() { let mut kb = KeyBuilder::new(); - assert_eq!(kb.last_pushed_segment_type(), None, "No segments"); + assert_eq!(kb.keypath_segments_len(), 0, "No segments"); - kb.push_object_key("first".to_string()); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::ObjectKey), - "Last segment is an object key"); + kb.push_object_key("first"); + assert!(kb.last_pushed_keypath_is_object_key(), "Last segment is an object key"); - kb.push_object_key("second".to_string()); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::ObjectKey), - "Last segment is an object key"); + kb.push_object_key("second"); + assert!(kb.last_pushed_keypath_is_object_key(), "Last segment is an object key"); kb.push_array(); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::Array), - "Last segment is an array"); - - kb.push_word("astemmedword"); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::Word), - "Last segment is a word"); - - kb.push_doc_seq(123); - assert_eq!(kb.last_pushed_segment_type(), Some(SegmentType::DocSeq), - "Last segment is a doc sequence"); + assert!(!kb.last_pushed_keypath_is_object_key(), "Last segment is an array"); +; } #[test] @@ -330,7 +258,7 @@ mod tests { let mut dr = 
DocResult::new(); dr.seq = 123; - dr.array_path = vec![1,0]; + dr.arraypath = vec![1,0]; assert!(dr == KeyBuilder::parse_doc_result_from_key(&key)); } diff --git a/src/query.rs b/src/query.rs index ff1aba9..ae14dcc 100644 --- a/src/query.rs +++ b/src/query.rs @@ -17,14 +17,14 @@ use records_capnp::payload; pub struct DocResult { pub seq: u64, - pub array_path: Vec, + pub arraypath: Vec, } impl DocResult { pub fn new() -> DocResult { DocResult { seq: 0, - array_path: Vec::new(), + arraypath: Vec::new(), } } } @@ -32,7 +32,7 @@ impl DocResult { impl PartialEq for DocResult { fn eq(&self, other: &DocResult) -> bool { - self.seq == other.seq && self.array_path == other.array_path + self.seq == other.seq && self.arraypath == other.arraypath } } @@ -91,17 +91,21 @@ impl<'a> QueryResults<'a> { struct ExactMatchFilter { iter: DBIterator, kb: KeyBuilder, + keypathword: String, + stemmed: String, stemmed_offset: u64, suffix: String, suffix_offset: u64, } impl ExactMatchFilter { - fn new(iter: DBIterator, stemmed_word: &StemmedWord, mut kb: KeyBuilder) -> ExactMatchFilter { - kb.push_word(&stemmed_word.stemmed); + fn new(iter: DBIterator, stemmed_word: &StemmedWord, kb: KeyBuilder) -> ExactMatchFilter { + let keypathword = kb.get_keypathword_only(&stemmed_word.stemmed); ExactMatchFilter{ iter: iter, kb: kb, + keypathword: keypathword, + stemmed: stemmed_word.stemmed.clone(), stemmed_offset: stemmed_word.stemmed_offset as u64, suffix: stemmed_word.suffix.clone(), suffix_offset: stemmed_word.suffix_offset as u64, @@ -111,18 +115,12 @@ impl ExactMatchFilter { impl QueryRuntimeFilter for ExactMatchFilter { fn first_result(&mut self, start: &DocResult) -> Result, Error> { - // Build the full key - self.kb.push_doc_seq(start.seq); - self.kb.push_array_path(&start.array_path); + let key = self.kb.stemmed_word_key_from_doc_result(&self.stemmed, &start); // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.kb.key().as_bytes(), + self.iter.set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); - // Revert - self.kb.pop_array_path(); - self.kb.pop_doc_seq(); - self.next_result() } @@ -139,8 +137,8 @@ impl QueryRuntimeFilter for ExactMatchFilter { Some((key, value)) => (key, value), None => return Ok(None), }; - if !key.starts_with(self.kb.key().as_bytes()) { - // we passed the key paths we are interested in. nothing left to do */ + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. 
nothing left to do */ return Ok(None) } @@ -201,7 +199,7 @@ impl<'a> AndFilter<'a> { Some(base_result) => base_result, None => return Ok(None), }; - base_result.array_path.resize(self.array_depth, 0); + base_result.arraypath.resize(self.array_depth, 0); loop { self.current_filter += 1; @@ -214,7 +212,7 @@ impl<'a> AndFilter<'a> { Some(next_result) => next_result, None => return Ok(None), }; - next_result.array_path.resize(self.array_depth, 0); + next_result.arraypath.resize(self.array_depth, 0); if base_result == next_result { matches_count -= 1; @@ -379,7 +377,7 @@ ws if filters.len() == 1 { Ok(filters.pop().unwrap()) } else { - Ok(Box::new(AndFilter::new(filters, self.kb.array_depth))) + Ok(Box::new(AndFilter::new(filters, self.kb.arraypath_len()))) } } @@ -416,14 +414,14 @@ ws match self.consume_field() { Some(field) => { if self.consume(".") { - self.kb.push_object_key(field); + self.kb.push_object_key(&field); let ret = self.compare(); self.kb.pop_object_key(); ret } else if self.consume("=") { match self.consume_string_literal() { Ok(Some(literal)) => { - self.kb.push_object_key(field); + self.kb.push_object_key(&field); let stems = Stems::new(&literal); let mut filters: Vec> = Vec::new(); @@ -440,7 +438,7 @@ ws 0 => panic!("Cannot create a ExactMatchFilter"), 1 => Ok(filters.pop().unwrap()), _ => Ok(Box::new(AndFilter::new( - filters, self.kb.array_depth))), + filters, self.kb.arraypath_len()))), } }, // Empty literal @@ -450,7 +448,7 @@ ws } } } else if self.could_consume("[") { - self.kb.push_object_key(field); + self.kb.push_object_key(&field); let ret = self.array(); self.kb.pop_object_key(); ret From d3a913274379c5c83d939ad4e68e69a4b1b2f35f Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 7 Dec 2016 11:37:55 -0800 Subject: [PATCH 038/122] Merge Pull Request: Use rust-rocksdb 0.5.0 With the 0.5.0 release of rust-rocksdb we don't need to pull it directly from Github, but can use that version. --- Cargo.toml | 4 +--- src/index.rs | 24 +++++++----------------- src/json_shred.rs | 15 ++++++++------- tests/rocksdb.rs | 2 +- 4 files changed, 17 insertions(+), 28 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e82aa4a..63b292c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,9 +16,7 @@ rustc-serialize= "0.3.19" stemmer = "0.3.2" unicode-normalization = "0.1.2" unicode-segmentation = "0.1.2" - -[dependencies.rocksdb] -git = "https://github.com/spacejam/rust-rocksdb.git" +rocksdb = "0.5.0" [build-dependencies] diff --git a/src/index.rs b/src/index.rs index 2f29270..dbb7046 100644 --- a/src/index.rs +++ b/src/index.rs @@ -4,8 +4,6 @@ use std::collections::HashMap; use std::str; use records_capnp::header; -// Needed for a trait in order to `dekete/put()` into a `rocksdb::WriteBatch` -use self::rocksdb::Writable; use error::Error; use json_shred::{Shredder}; @@ -113,7 +111,8 @@ impl Index { // NOTE vmx 2016-10-13: Needed for the lifetime-checker, though not sure if it now really // does the right thing. Does the `try!()` still return as expected?
{ - let docid = try!(shredder.shred(json, self.high_doc_seq + 1, &self.batch())); + let docid = try!(shredder.shred(json, self.high_doc_seq + 1, + self.batch.as_mut().unwrap())); self.high_doc_seq += 1; self.id_str_to_id_seq.insert(format!("I{}", docid), format!("S{}", self.high_doc_seq)); } @@ -132,7 +131,7 @@ impl Index { // TODO vmx 2016-10-17: Use multiget once the Rust wrapper supports it match rocks.get(key.as_bytes()) { Ok(Some(seq)) => { - try!(self.batch().delete(&*seq)); + try!(self.batch.as_mut().unwrap().delete(&*seq)); }, _ => {} } @@ -140,17 +139,17 @@ impl Index { // Add the ids_to_seq keyspace entries for (id, seq) in &self.id_str_to_id_seq { - try!(self.batch().put(id.as_bytes(), seq.as_bytes())); - try!(self.batch().put(seq.as_bytes(), id.as_bytes())); + try!(self.batch.as_mut().unwrap().put(id.as_bytes(), seq.as_bytes())); + try!(self.batch.as_mut().unwrap().put(seq.as_bytes(), id.as_bytes())); } let mut header = Header::new(); header.high_seq = self.high_doc_seq; - try!(self.batch().put(b"HDB", &*header.serialize())); + try!(self.batch.as_mut().unwrap().put(b"HDB", &*header.serialize())); let status = try!(rocks.write(self.batch.take().unwrap())); // Make sure there's always a valid WriteBatch after writing it into RocksDB, - // else calls to `self.batch()` would panic. + // else calls to `self.batch.as_mut().unwrap()` would panic. self.batch = Some(rocksdb::WriteBatch::default()); self.id_str_to_id_seq.clear(); Ok(status) @@ -170,15 +169,6 @@ impl Index { } } - /// Returns the current write batch as reference - /// - /// # Panics - /// - /// Panic if there currently is no `WriteBatch` (`self.batch == None`) - fn batch(&self) -> &rocksdb::WriteBatch { - self.batch.as_ref().unwrap() - } - fn compare_keys(a: &[u8], b: &[u8]) -> i32 { use std::cmp::Ordering; use key_builder::KeyBuilder; diff --git a/src/json_shred.rs b/src/json_shred.rs index 08fca91..d766c40 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -4,8 +4,6 @@ extern crate rustc_serialize; use std::collections::HashMap; use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; -// Needed for a trait in order to `put()` into a `rocksdb::WriteBatch` -use self::rocksdb::Writable; use error::Error; use key_builder::KeyBuilder; @@ -66,7 +64,8 @@ impl Shredder { } } - fn add_entries(&mut self, text: &String, docseq: u64, batch: &rocksdb::WriteBatch) -> Result<(), Error> { + fn add_entries(&mut self, text: &String, docseq: u64, batch: &mut rocksdb::WriteBatch) -> + Result<(), Error> { let stems = Stems::new(text.as_str()); let mut word_to_word_infos = HashMap::new(); @@ -148,7 +147,8 @@ impl Shredder { Ok(()) } - pub fn shred(&mut self, json: &str, docseq: u64, batch: &rocksdb::WriteBatch) -> Result { + pub fn shred(&mut self, json: &str, docseq: u64, batch: &mut rocksdb::WriteBatch) -> + Result { let mut parser = Parser::new(json.chars()); let mut token = parser.next(); loop { @@ -197,12 +197,13 @@ impl Shredder { // Pop the dummy object that makes ObjectEnd happy // or the previous object key self.kb.pop_object_key(); - self.kb.push_object_key(&key);; + self.kb.push_object_key(&key); - try!(self.add_entries(&value, docseq, &batch)); + try!(self.add_entries(&value, docseq, batch)); + self.kb.inc_top_array_offset(); }, ObjectKeyTypes::NoKey => { - try!(self.add_entries(&value, docseq, &batch)); + try!(self.add_entries(&value, docseq, batch)); self.kb.inc_top_array_offset(); }, ObjectKeyTypes::Ignore => { diff --git a/tests/rocksdb.rs b/tests/rocksdb.rs index 8c9fe3c..4a80b9c 100644 ---
a/tests/rocksdb.rs +++ b/tests/rocksdb.rs @@ -1,5 +1,5 @@ extern crate rocksdb; -use rocksdb::{DB, Writable}; +use rocksdb::{DB}; #[test] fn rocksdb_works() { From 0f3f92e99a374cc87ac38369b081f62f50925c0d Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 7 Dec 2016 12:39:31 -0800 Subject: [PATCH 039/122] Fix broken tests by making mut ref --- src/json_shred.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 06c21cf..2ef62f0 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -263,7 +263,7 @@ mod tests { let json = r#"{"some": ["array", "data", ["also", "nested"]]}"#; let docseq = 123; let batch = rocksdb::WriteBatch::default(); - shredder.shred(json, docseq, &batch).unwrap(); + shredder.shred(json, docseq, &mut batch).unwrap(); let rocks = rocksdb::DB::open_default("target/tests/test_shred_netsted").unwrap(); rocks.write(batch).unwrap(); @@ -292,7 +292,7 @@ mod tests { let json = r#"{"A":[{"B":"B2VMX two three","C":"..C2"},{"B": "b1","C":"..C2"}]}"#; let docseq = 1234; let batch = rocksdb::WriteBatch::default(); - shredder.shred(json, docseq, &batch).unwrap(); + shredder.shred(json, docseq, &mut batch).unwrap(); let rocks = rocksdb::DB::open_default("target/tests/test_shred_objects").unwrap(); rocks.write(batch).unwrap(); @@ -326,7 +326,7 @@ mod tests { let json = r#"{}"#; let docseq = 123; let batch = rocksdb::WriteBatch::default(); - shredder.shred(json, docseq, &batch).unwrap(); + shredder.shred(json, docseq, &mut batch).unwrap(); let rocks = rocksdb::DB::open_default("target/tests/test_shred_empty_object").unwrap(); rocks.write(batch).unwrap(); From 1fcf73a2f142168ae15d87e387f7a224266e15e5 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 13 Dec 2016 22:24:22 -0800 Subject: [PATCH 040/122] New example-based syntax MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New find keyword and example-based syntax where queries look like the json structure they are finding.
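To make the new syntax concrete, here is a minimal sketch in the style of the query tests this patch adds below. It assumes an already-opened, writable `index` (the setup code is elided here) and uses only the `add`, `flush`, `Query::get_matches`, and `get_next_id` calls that appear in those tests; the document and id are illustrative.

    // Hedged sketch, not part of the patch: assumes `index` is an open,
    // writable Index as in the test modules of this series.
    let _ = index.add(r#"{"_id":"1", "A":[{"B":"B2","C":"C2"}]}"#);
    index.flush().unwrap();

    // The query reads like the JSON it matches: braces for objects, brackets
    // for arrays, and `==` comparisons in place of terminal values.
    let mut results = Query::get_matches(
        r#"find {A:[{B: == "B2", C: == "C2"}]}"#.to_string(), &index).unwrap();
    assert_eq!(results.get_next_id().unwrap(), Some("1".to_string()));
    assert_eq!(results.get_next_id().unwrap(), None);

    // Clauses combine with `,`/`&&` (intersection) and `||` (union).
    results = Query::get_matches(
        r#"find {A:[{B: == "B2" || B: == "b1"}]}"#.to_string(), &index).unwrap();
    assert_eq!(results.get_next_id().unwrap(), Some("1".to_string()));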
--- src/json_shred.rs | 6 +- src/query.rs | 472 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 354 insertions(+), 124 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 2ef62f0..7696126 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -262,7 +262,7 @@ mod tests { let mut shredder = super::Shredder::new(); let json = r#"{"some": ["array", "data", ["also", "nested"]]}"#; let docseq = 123; - let batch = rocksdb::WriteBatch::default(); + let mut batch = rocksdb::WriteBatch::default(); shredder.shred(json, docseq, &mut batch).unwrap(); let rocks = rocksdb::DB::open_default("target/tests/test_shred_netsted").unwrap(); @@ -291,7 +291,7 @@ mod tests { let mut shredder = super::Shredder::new(); let json = r#"{"A":[{"B":"B2VMX two three","C":"..C2"},{"B": "b1","C":"..C2"}]}"#; let docseq = 1234; - let batch = rocksdb::WriteBatch::default(); + let mut batch = rocksdb::WriteBatch::default(); shredder.shred(json, docseq, &mut batch).unwrap(); let rocks = rocksdb::DB::open_default("target/tests/test_shred_objects").unwrap(); @@ -325,7 +325,7 @@ mod tests { let mut shredder = super::Shredder::new(); let json = r#"{}"#; let docseq = 123; - let batch = rocksdb::WriteBatch::default(); + let mut batch = rocksdb::WriteBatch::default(); shredder.shred(json, docseq, &mut batch).unwrap(); let rocks = rocksdb::DB::open_default("target/tests/test_shred_empty_object").unwrap(); diff --git a/src/query.rs b/src/query.rs index ae14dcc..335b1bf 100644 --- a/src/query.rs +++ b/src/query.rs @@ -3,18 +3,20 @@ extern crate capnp; use std::str; +use std::cmp::Ordering; use error::Error; use index::Index; use key_builder::KeyBuilder; use stems::{StemmedWord, Stems}; + // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs use rocksdb::{self, DBIterator, IteratorMode, Snapshot}; use records_capnp::payload; - +#[derive(PartialEq, Eq, PartialOrd, Clone)] pub struct DocResult { pub seq: u64, pub arraypath: Vec, @@ -29,14 +31,19 @@ impl DocResult { } } - -impl PartialEq for DocResult { - fn eq(&self, other: &DocResult) -> bool { - self.seq == other.seq && self.arraypath == other.arraypath +impl Ord for DocResult { + fn cmp(&self, other: &DocResult) -> Ordering { + match self.seq.cmp(&other.seq) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => self.arraypath.cmp(&other.arraypath), + } } } - -impl Eq for DocResult {} +/* +impl Clone for DocResult { + fn clone(&self) -> DocResult { *self } +}*/ pub trait QueryRuntimeFilter { fn first_result(&mut self, start: &DocResult) -> Result, Error>; @@ -144,9 +151,9 @@ impl QueryRuntimeFilter for ExactMatchFilter { // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed // and why we pass on mutable reference of it to `read_message()` - let mut ref_value = &*value; + //let mut ref_value = &*value; let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + &mut &*value, ::capnp::message::ReaderOptions::new()).unwrap(); let payload = message_reader.get_root::().unwrap(); for wi in try!(payload.get_wordinfos()).iter() { @@ -240,6 +247,125 @@ impl<'a> QueryRuntimeFilter for AndFilter<'a> { } +struct FilterAndResult<'a> { + filter: Box, + result: Option, + is_done: bool, + array_depth: usize, +} + +impl<'a> FilterAndResult<'a> { + fn prime_first_result(&mut self, start: &DocResult) -> Result<(), Error> { + if self.is_done { + return Ok(()) + } + if 
self.result.is_none() { + self.result = try!(self.filter.first_result(start)); + } else if self.result.as_ref().unwrap() < start { + self.result = try!(self.filter.first_result(start)); + } + if self.result.is_none() { + self.is_done = true; + } else { + self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); + } + Ok(()) + } + + fn prime_next_result(&mut self) -> Result<(), Error> { + if self.is_done { + return Ok(()) + } + if self.result.is_none() { + self.result = try!(self.filter.next_result()); + } + if self.result.is_none() { + self.is_done = true; + } else { + self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); + } + Ok(()) + } +} + +struct OrFilter<'a> { + left: FilterAndResult<'a>, + right: FilterAndResult<'a>, +} + +impl<'a> OrFilter<'a> { + fn new(left: Box, + right: Box, + array_depth: usize) -> OrFilter<'a> { + OrFilter { + left: FilterAndResult{filter: left, + result: None, + array_depth: array_depth, + is_done: false, + }, + + right: FilterAndResult{filter: right, + result: None, + array_depth: array_depth, + is_done: false, + } + } + } + fn take_smallest(&mut self) -> Option { + if let Some(left) = self.left.result.take() { + // left exists + if let Some(right) = self.right.result.take() { + // both exist, return smallest + match left.cmp(&right) { + Ordering::Less => { + // left is smallest, return and put back right + self.right.result = Some(right); + Some(left) + }, + Ordering::Greater => { + // right is smallest, return and put back left + self.left.result = Some(left); + Some(right) + }, + Ordering::Equal => { + // return one and discard the other so we don't return + // identical result in a subsequent call + Some(left) + }, + } + } else { + // right doesn't exist. return left + Some(left) + } + } else { + // left doesn't exist + if self.right.result.is_some() { + // right exists. return it + self.right.result.take() + } else { + // neither exists. 
return none + None + } + } + } +} + +impl<'a> QueryRuntimeFilter for OrFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + try!(self.left.prime_first_result(start)); + try!(self.right.prime_first_result(start)); + Ok(self.take_smallest()) + } + + fn next_result(&mut self) -> Result, Error> { + try!(self.left.prime_next_result()); + try!(self.right.prime_next_result()); + Ok(self.take_smallest()) + } +} + + + struct Parser<'a> { query: String, @@ -250,7 +376,7 @@ struct Parser<'a> { impl<'a> Parser<'a> { fn new(query: String, snapshot: Snapshot<'a>) -> Parser<'a> { - Parser{ + Parser { query: query, offset: 0, kb: KeyBuilder::new(), @@ -258,7 +384,7 @@ impl<'a> Parser<'a> { } } - fn whitespace(&mut self) { + fn ws(&mut self) { for char in self.query[self.offset..].chars() { if !char.is_whitespace() { break; @@ -270,13 +396,25 @@ impl<'a> Parser<'a> { fn consume(&mut self, token: &str) -> bool { if self.could_consume(token) { self.offset += token.len(); - self.whitespace(); + self.ws(); true } else { false } } + + fn must_consume(&mut self, token: &str) -> Result<(), Error> { + if self.could_consume(token) { + self.offset += token.len(); + self.ws(); + Ok(()) + } else { + Err(Error::Parse(format!("Expected '{}' at character {}.", + token, self.offset))) + } + } + fn could_consume(&mut self, token: &str) -> bool { self.query[self.offset..].starts_with(token) } @@ -292,14 +430,14 @@ impl<'a> Parser<'a> { } if result.len() > 0 { self.offset += result.len(); - self.whitespace(); + self.ws(); Some(result) } else { None } } - fn consume_string_literal(&mut self) -> Result, Error> { + fn consume_string_literal(&mut self) -> Result { let mut lit = String::new(); let mut next_is_special_char = false; if self.could_consume("\"") { @@ -333,138 +471,230 @@ impl<'a> Parser<'a> { } } } - if self.consume("\"") { - Ok(Some(lit)) - } else { - Err(Error::Parse("Expected \"".to_string())) - } + try!(self.must_consume("\"")); + Ok(lit) } else { - Ok(None) + Err(Error::Parse("Expected \"".to_string())) } } - /* -This is a peg grammar that documents the calls of the recursive descent parser -is implemented. Can be checked here: http://pegjs.org/online -bool - = ws compare ws ('&' ws compare)* +find + = "find" ws object ws + +object + = "{" ws obool ws "}" ws (("&&" / "||") ws object)? + / parens + +parens + = "(" ws object ws ")" + +obool + = ws ocompare ws (('&&' / ',' / '||') ws obool)? + +ocompare + = oparens + / key ws ":" ws (oparens / compare) + +oparens + = '(' ws obool ws ')' ws + / array + / object + compare - = field ('.' field)* ws '=' ws string* / factor -factor - = '(' ws bool ws ')' ws / array + = ("==" / "~=" / "^=" ) ws string ws + + +abool + = ws acompare ws (('&&'/ ',' / '||') ws abool)? 
+ +acompare + = aparens + / compare + +aparens + = '(' ws abool ')' ws + / array + / object + array - = '[' ws bool ']' ws + = '[' ws abool ']' ws + +key + = field / string + field - = [a-z]i+ ws + = [a-z_$]i [a-z_$0-9]i* + string - = '"' ('\\\\' / '\\' [\"tfvrnb] / [^\\\"])* '"' ws + = '"' ('\\\\' / '\\' [\"tfvrnb] / [^\\\"])* '"' ws + ws - = [ /\t/\r\n]* + = [ \t\n\r]* + +ws1 + = [ \t\n\r]+ */ - fn bool<'b>(&'b mut self) -> Result, Error> { - let left = try!(self.compare()); - let mut filters = vec![left]; - loop { - if !self.consume("&") { - break; - } - let right = try!(self.compare()); - filters.push(right); + fn find<'b>(&'b mut self) -> Result, Error> { + if !self.consume("find") { + return Err(Error::Parse("Missing 'find' keyword".to_string())); } - if filters.len() == 1 { - Ok(filters.pop().unwrap()) + self.object() + } + + fn object<'b>(&'b mut self) -> Result, Error> { + if self.consume("{") { + let left = try!(self.obool()); + try!(self.must_consume("}")); + + if self.consume("&&") { + let right = try!(self.object()); + Ok(Box::new(AndFilter::new(vec![left, right], self.kb.arraypath_len()))) + + } else if self.consume("||") { + let right = try!(self.object()); + Ok(Box::new(OrFilter::new(left, right, self.kb.arraypath_len()))) + } else { + Ok(left) + } } else { - Ok(Box::new(AndFilter::new(filters, self.kb.arraypath_len()))) + self.parens() } } + fn parens<'b>(&'b mut self) -> Result, Error> { + try!(self.must_consume("(")); + let filter = try!(self.object()); + try!(self.must_consume(")")); + Ok(filter) + } - fn array<'b>(&'b mut self) -> Result, Error> { - if !self.consume("[") { - return Err(Error::Parse("Expected '['".to_string())); - } - self.kb.push_array(); - let filter = try!(self.bool()); - self.kb.pop_array(); - if !self.consume("]") { - return Err(Error::Parse("Expected ']'".to_string())); + fn obool<'b>(&'b mut self) -> Result, Error> { + let mut filter = try!(self.ocompare()); + loop { + filter = if self.consume("&&") || self.consume(",") { + let right = try!(self.obool()); + Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len())) + } else if self.consume("||") { + let right = try!(self.obool()); + Box::new(OrFilter::new(filter, right, self.kb.arraypath_len())) + } else { + break; + } } Ok(filter) } - fn factor<'b>(&'b mut self) -> Result, Error> { - if self.consume("(") { - let filter = try!(self.bool()); - if !self.consume(")") { - Err(Error::Parse("Expected ')'".to_string())) + fn ocompare<'b>(&'b mut self) -> Result, Error> { + if let Some(filter) = try!(self.oparens()) { + Ok(filter) + } else if let Some(field) = self.consume_field() { + self.kb.push_object_key(&field); + try!(self.must_consume(":")); + if let Some(filter) = try!(self.oparens()) { + self.kb.pop_object_key(); + Ok(filter) } else { + let filter = try!(self.compare()); + self.kb.pop_object_key(); Ok(filter) } + } else { + Err(Error::Parse("Expected object key or '('".to_string())) + } + } + + fn oparens<'b>(&'b mut self) -> Result>, Error> { + if self.consume("(") { + let f = try!(self.obool()); + try!(self.must_consume(")")); + Ok(Some(f)) } else if self.could_consume("[") { - self.array() + Ok(Some(try!(self.array()))) + } else if self.could_consume("{") { + Ok(Some(try!(self.object()))) } else { - Err(Error::Parse("Missing Expression".to_string())) + Ok(None) } } fn compare<'b>(&'b mut self) -> Result, Error> { - match self.consume_field() { - Some(field) => { - if self.consume(".") { - self.kb.push_object_key(&field); - let ret = self.compare(); - self.kb.pop_object_key(); - 
ret - } else if self.consume("=") { - match self.consume_string_literal() { - Ok(Some(literal)) => { - self.kb.push_object_key(&field); - - let stems = Stems::new(&literal); - let mut filters: Vec> = Vec::new(); - for stem in stems { - let iter = self.snapshot.iterator(IteratorMode::Start); - let filter = Box::new(ExactMatchFilter::new( - iter, &stem, self.kb.clone())); - filters.push(filter); - } - - self.kb.pop_object_key(); - - match filters.len() { - 0 => panic!("Cannot create a ExactMatchFilter"), - 1 => Ok(filters.pop().unwrap()), - _ => Ok(Box::new(AndFilter::new( - filters, self.kb.arraypath_len()))), - } - }, - // Empty literal - Ok(None) => {Err(Error::Parse("Expected string".to_string()))}, - Err(error) => { - Err(error) - } - } - } else if self.could_consume("[") { - self.kb.push_object_key(&field); - let ret = self.array(); - self.kb.pop_object_key(); - ret - } else { - Err(Error::Parse("Expected comparison or array operator".to_string())) - } - }, - None => { - self.factor() + if self.consume("==") { + let literal = try!(self.consume_string_literal()); + let stems = Stems::new(&literal); + let mut filters: Vec> = Vec::new(); + for stem in stems { + let iter = self.snapshot.iterator(IteratorMode::Start); + let filter = Box::new(ExactMatchFilter::new( + iter, &stem, self.kb.clone())); + filters.push(filter); + } + match filters.len() { + 0 => panic!("Cannot create a ExactMatchFilter"), + 1 => Ok(filters.pop().unwrap()), + _ => Ok(Box::new(AndFilter::new( + filters, self.kb.arraypath_len()))), } + } else { + Err(Error::Parse("Expected comparison operator".to_string())) } } + fn abool<'b>(&'b mut self) -> Result, Error> { + let mut filter = try!(self.acompare()); + loop { + filter = if self.consume("&&") || self.consume(",") { + let right = try!(self.abool()); + Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len())) + } else if self.consume("||") { + let right = try!(self.abool()); + Box::new(OrFilter::new(filter, right, self.kb.arraypath_len())) + } else { + break; + } + } + Ok(filter) + } + + fn acompare<'b>(&'b mut self) -> Result, Error> { + if let Some(filter) = try!(self.aparens()) { + Ok(filter) + } else { + self.compare() + } + } + + fn aparens<'b>(&'b mut self) -> Result>, Error> { + if self.consume("(") { + let f = try!(self.abool()); + try!(self.must_consume(")")); + Ok(Some(f)) + } else if self.could_consume("[") { + Ok(Some(try!(self.array()))) + } else if self.could_consume("{") { + Ok(Some(try!(self.object()))) + } else { + Ok(None) + } + } + + fn array<'b>(&'b mut self) -> Result, Error> { + if !self.consume("[") { + return Err(Error::Parse("Expected '['".to_string())); + } + self.kb.push_array(); + let filter = try!(self.abool()); + self.kb.pop_array(); + try!(self.must_consume("]")); + Ok(filter) + } + + fn build_filter(mut self) -> Result<(Box, Snapshot<'a>), Error> { - self.whitespace(); - Ok((self.bool().unwrap(), self.snapshot)) + self.ws(); + Ok((try!(self.find()), self.snapshot)) } } @@ -501,13 +731,13 @@ mod tests { let mut query = " \n \t test".to_string(); let mut parser = Parser::new(query, snapshot); - parser.whitespace(); + parser.ws(); assert_eq!(parser.offset, 5); snapshot = Snapshot::new(rocks); query = "test".to_string(); parser = Parser::new(query, snapshot); - parser.whitespace(); + parser.ws(); assert_eq!(parser.offset, 0); } @@ -520,7 +750,7 @@ mod tests { let query = r#"" \n \t test""#.to_string(); let mut parser = Parser::new(query, snapshot); - assert_eq!(parser.consume_string_literal().unwrap().unwrap(), " \n \t 
test".to_string()); + assert_eq!(parser.consume_string_literal().unwrap(), " \n \t test".to_string()); } #[test] @@ -533,7 +763,7 @@ mod tests { let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#); index.flush().unwrap(); - let mut query_results = Query::get_matches(r#"hello="world""#.to_string(), &index).unwrap(); + let mut query_results = Query::get_matches(r#"find {hello:=="world"}"#.to_string(), &index).unwrap(); //let mut query_results = Query::get_matches(r#"a.b[foo="bar"]"#.to_string(), &index).unwrap(); println!("query results: {:?}", query_results.get_next_id()); } @@ -553,32 +783,32 @@ mod tests { index.flush().unwrap(); - let mut query_results = Query::get_matches(r#"A[B = "B2" & C[ D = "D" ]]"#.to_string(), &index).unwrap(); + let mut query_results = Query::get_matches(r#"find {A:[{B: =="B2", C: [{D: =="D"} ]}]}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); - query_results = Query::get_matches(r#"A[B = "B2" & C = "C2"]"#.to_string(), &index).unwrap(); + query_results = Query::get_matches(r#"find {A:[{B: == "B2", C: == "C2"}]}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); - query_results = Query::get_matches(r#"A[B = "b1" & C = "C2"]"#.to_string(), &index).unwrap(); + query_results = Query::get_matches(r#"find {A:[{B: == "b1", C: == "C2"}]}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); - query_results = Query::get_matches(r#"A = "Multi word sentence""#.to_string(), &index).unwrap(); + query_results = Query::get_matches(r#"find {A: == "Multi word sentence"}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); - query_results = Query::get_matches(r#"A = "%&%}{}@);€""#.to_string(), &index).unwrap(); + query_results = Query::get_matches(r#"find {A: == "%&%}{}@);€"}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("4".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); - query_results = Query::get_matches(r#"A = "{}€52 deeply \\n\\v ""#.to_string(), &index).unwrap(); + query_results = Query::get_matches(r#"find {A: == "{}€52 deeply \\n\\v "}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("5".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); - query_results = Query::get_matches(r#"A[C = "C2"]"#.to_string(), &index).unwrap(); + query_results = Query::get_matches(r#"find {A:[{C: == "C2"}]}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); @@ -598,7 +828,7 @@ mod tests { } index.flush().unwrap(); - let mut query_results = Query::get_matches(r#"data = "u""#.to_string(), &index).unwrap(); + let mut query_results = Query::get_matches(r#"find {data: == "u"}"#.to_string(), &index).unwrap(); loop { match query_results.get_next_id() { Ok(Some(result)) => println!("result: {}", result), From bf29a1bdd57a580b50632cc222f2faf7133d553a Mon Sep 17 00:00:00 2001 From: Damien Katz 
Date: Tue, 13 Dec 2016 22:36:26 -0800 Subject: [PATCH 041/122] Add test for || (or) Operator --- src/query.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/query.rs b/src/query.rs index 335b1bf..886ae47 100644 --- a/src/query.rs +++ b/src/query.rs @@ -780,6 +780,10 @@ mod tests { let _ = index.add(r#"{"_id":"3", "A":"Multi word sentence"}"#); let _ = index.add(r#"{"_id":"4", "A":"%&%}{}@);€"}"#); let _ = index.add(r#"{"_id":"5", "A":"{}€52 deeply \\n\\v "}"#); + let _ = index.add(r#"{"_id":"6", "A":[{"B":"B3"},{"B": "B3"}]}"#); + let _ = index.add(r#"{"_id":"7", "A":[{"B":"B3"},{"B": "B4"}]}"#); + let _ = index.add(r#"{"_id":"8", "A":["A1", "A1"]}"#); + let _ = index.add(r#"{"_id":"9", "A":["A1", "A2"]}"#); index.flush().unwrap(); @@ -812,6 +816,17 @@ mod tests { assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[{B: == "B3" || B: == "B4"}]}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("6".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), Some("7".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[ == "A1" || == "A2"]}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("8".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), Some("9".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + } #[test] From 865ec4c48c9a85bc20e57b563254885eadb14b69 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 14 Dec 2016 17:44:18 -0800 Subject: [PATCH 042/122] Change potentially confusing name of struct to FilterWithResult MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes name from FilterAndResult to FilterWithResult, because it's used by the OrFilter but could easily have been mistaken for a component of the AndFilter. --- src/query.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/query.rs b/src/query.rs index 886ae47..0c7965b 100644 --- a/src/query.rs +++ b/src/query.rs @@ -246,15 +246,17 @@ impl<'a> QueryRuntimeFilter for AndFilter<'a> { } } - -struct FilterAndResult<'a> { +/// Used by OrFilter to maintain an already fetched result so we don't refetch when one side isn't +/// returned to the caller. Because we won't know which side gets returned until both sides are +/// fetched.
+struct FilterWithResult<'a> { filter: Box, result: Option, is_done: bool, array_depth: usize, } -impl<'a> FilterAndResult<'a> { +impl<'a> FilterWithResult<'a> { fn prime_first_result(&mut self, start: &DocResult) -> Result<(), Error> { if self.is_done { return Ok(()) } @@ -289,8 +291,8 @@ impl<'a> FilterAndResult<'a> { } struct OrFilter<'a> { - left: FilterAndResult<'a>, - right: FilterAndResult<'a>, + left: FilterWithResult<'a>, + right: FilterWithResult<'a>, } impl<'a> OrFilter<'a> { @@ -298,13 +300,13 @@ impl<'a> OrFilter<'a> { right: Box, array_depth: usize) -> OrFilter<'a> { OrFilter { - left: FilterAndResult{filter: left, + left: FilterWithResult{filter: left, result: None, array_depth: array_depth, is_done: false, }, - right: FilterAndResult{filter: right, + right: FilterWithResult{filter: right, result: None, array_depth: array_depth, is_done: false, From c3c34317d75ab7f5f4b7e25a9cd3e96c8ff4b4d7 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 15 Dec 2016 17:34:41 -0800 Subject: [PATCH 043/122] Optimization to avoid string allocation/construction in filter first_result MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows the string to avoid reconstructing the parts that won't change (the keypathword) while allowing the seq and array path to be added and removed as necessary with no extra allocations. --- src/key_builder.rs | 47 +++++++++++++++++++++++----------------------- src/query.rs | 8 ++++---- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/key_builder.rs b/src/key_builder.rs index c79a125..b8bde89 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -17,20 +17,7 @@ impl KeyBuilder { } } - /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath - /// built up internally. - pub fn stemmed_word_key(&self, word: &str, seq: u64) -> String { - self.stemmed_word_key_internal(word, seq, &self.arraypath) - } - - /// Builds a stemmed word key for the input word and doc result, using the key_path built up - /// internally but ignoring the internal array path. Instead uses the array path from the - /// DocResult - pub fn stemmed_word_key_from_doc_result(&self, word: &str, dr: &DocResult) -> String { - self.stemmed_word_key_internal(word, dr.seq, &dr.arraypath) - } - - fn stemmed_word_key_internal(&self, word: &str, seq: u64, arraypath: &Vec) -> String { + pub fn get_keypathword_only(&self, word: &str) -> String { let mut string = String::with_capacity(100); string.push('W'); for segment in &self.keypath { @@ -39,12 +26,31 @@ impl KeyBuilder { string.push('!'); string.push_str(word); string.push('#'); + string + } + + /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath + /// built up internally. + pub fn stemmed_word_key(&self, word: &str, seq: u64) -> String { + let mut string = self.get_keypathword_only(&word); string.push_str(seq.to_string().as_str()); - self.add_arraypath(&mut string, &arraypath); + KeyBuilder::add_arraypath(&mut string, &self.arraypath); string } + /// Adds DocResult seq and array path to an already created keypathword.
+ pub fn add_doc_result_to_keypathword(keypathword: &mut String, dr: &DocResult) { + keypathword.push_str(dr.seq.to_string().as_str()); + KeyBuilder::add_arraypath(keypathword, &dr.arraypath); + } + + pub fn truncate_to_keypathword(stemmed_word_key: &mut String) { + let n = stemmed_word_key.rfind("#").unwrap(); + stemmed_word_key.truncate(n + 1); + } + + /// Builds a value key for seq (value keys are the original json terminal value with /// keyed on keypath and arraypath built up internally). pub fn value_key(&self, seq: u64) -> String { @@ -56,11 +62,11 @@ impl KeyBuilder { string.push('#'); string.push_str(&seq.to_string()); - self.add_arraypath(&mut string, &self.arraypath); + KeyBuilder::add_arraypath(&mut string, &self.arraypath); string } - fn add_arraypath(&self, string: &mut String, arraypath: &Vec) { + fn add_arraypath(string: &mut String, arraypath: &Vec) { if arraypath.is_empty() { string.push(','); } else { @@ -131,13 +137,6 @@ impl KeyBuilder { (&str[..n], &seq_arraypath_str[..m], &seq_arraypath_str[m + 1..]) } - pub fn get_keypathword_only(&self, stemmed: &str) -> String { - let mut key = self.stemmed_word_key(stemmed, 0); - let n = key.rfind("#").unwrap(); - key.truncate(n + 1); - key - } - /* parses a seq and array path portion (ex "123,0,0,10) of a key into a doc result */ pub fn parse_doc_result_from_key(str: &str) -> DocResult { let mut dr = DocResult::new(); diff --git a/src/query.rs b/src/query.rs index 0c7965b..5d6a44f 100644 --- a/src/query.rs +++ b/src/query.rs @@ -97,7 +97,6 @@ impl<'a> QueryResults<'a> { struct ExactMatchFilter { iter: DBIterator, - kb: KeyBuilder, keypathword: String, stemmed: String, stemmed_offset: u64, @@ -110,7 +109,6 @@ impl ExactMatchFilter { let keypathword = kb.get_keypathword_only(&stemmed_word.stemmed); ExactMatchFilter{ iter: iter, - kb: kb, keypathword: keypathword, stemmed: stemmed_word.stemmed.clone(), stemmed_offset: stemmed_word.stemmed_offset as u64, @@ -123,10 +121,12 @@ impl ExactMatchFilter { impl QueryRuntimeFilter for ExactMatchFilter { fn first_result(&mut self, start: &DocResult) -> Result, Error> { - let key = self.kb.stemmed_word_key_from_doc_result(&self.stemmed, &start); + KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(key.as_bytes(), + self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), rocksdb::Direction::Forward)); + + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); self.next_result() } From d35b11ef9b78a65f0184766ecddbaccff465d3fe Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sat, 17 Dec 2016 23:52:52 -0800 Subject: [PATCH 044/122] new query operator ~= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This query operator gives us full text matching. When used like this: {foo: ~= “word”} Searches field foo for any occurrence of stemmed “word”. This: {foo: ~= “some phrase”} Searches field foo for any occurrence of stemmed phrase “some phrase”. This: {foo: ~10= “warriors basketball win”} Searches field foo for any occurrence of stemmed words “warriors” “basketball” “win” which are all separated from each other by 10 words or less. 
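To make the three forms above concrete, here is a minimal sketch in the style of the earlier query tests, assuming only the semantics this commit message describes. The document, id, and expected matches are illustrative, the `index` setup is elided, and exact results depend on the stemmer.

    // Hedged sketch, not part of the patch: assumes `index` is an open,
    // writable Index as in the earlier test modules.
    let _ = index.add(r#"{"_id":"1", "foo":"warriors win the basketball championship"}"#);
    index.flush().unwrap();

    // Word match: any occurrence of the stemmed word "win".
    let mut results = Query::get_matches(
        r#"find {foo: ~= "win"}"#.to_string(), &index).unwrap();
    assert_eq!(results.get_next_id().unwrap(), Some("1".to_string()));

    // Phrase match: the stemmed words must occur together as a phrase.
    results = Query::get_matches(
        r#"find {foo: ~= "basketball championship"}"#.to_string(), &index).unwrap();
    assert_eq!(results.get_next_id().unwrap(), Some("1".to_string()));

    // Proximity match: all stemmed words within 10 words of each other.
    results = Query::get_matches(
        r#"find {foo: ~10= "warriors basketball win"}"#.to_string(), &index).unwrap();
    assert_eq!(results.get_next_id().unwrap(), Some("1".to_string()));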
---
 .gitignore          |   3 +
 capnp/records.capnp |  28 ++-
 src/error.rs        |   7 +
 src/json_shred.rs   |  32 +--
 src/query.rs        | 508 +++++++++++++++++++++++++++++++++++++++-----
 src/stems.rs        |  76 ++++---
 6 files changed, 556 insertions(+), 98 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0aa91de..aacb5b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,6 @@
 Cargo.lock
 **/*.iml
 .idea/
+
+.DS_Store
+

diff --git a/capnp/records.capnp b/capnp/records.capnp
index 6cbed40..ccd3d7d 100644
--- a/capnp/records.capnp
+++ b/capnp/records.capnp
@@ -1,15 +1,37 @@
-@0x9266127bb5310c6c;
+@0x89d4fcde0ae482cb;

 struct Header {
     version @0 :UInt64;
     highSeq @1 :UInt64;
 }

+enum Case {
+    uppercase @0;
+    propercase @1;
+}
+
 struct Payload {
-    struct Wordinfo {
-        stemmedOffset @0 :UInt64;
+
+    struct Wordinfo {
+        # Contains the stemmed word and information about the original word before stemming
+
+        # the position of the word in the text field
+        wordPos @0 :UInt64;
+
+        # the offset of the suffix from the start of the stemmed word;
+        # when combined with the stemmed word it gets back the original
+        # text with case preserved
         suffixOffset @1 :UInt64;
+
+        # the actual suffix text, which can start at any point in the stemmed word
         suffixText @2 :Text;
+
+        # NOTE: at some point we should store bit flags that indicate if the original string
+        # was propercase, all uppercase, contains a trailing space, a trailing period,
+        # a trailing period and space, etc., up to 8 flags. This would mean less information
+        # would need to be stored in the suffix text for most words, at the cost of 1 byte
+        # per word info.
     }
     wordinfos @0 :List(Wordinfo);
 }
+

diff --git a/src/error.rs b/src/error.rs
index 62af691..02783a2 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -2,6 +2,7 @@ extern crate capnp;
 extern crate rocksdb;

 use std::{error, fmt};
+use std::num::ParseIntError;

 #[derive(Debug)]
 pub enum Error {
@@ -48,6 +49,12 @@ impl From for Error
     }
 }

+impl From<ParseIntError> for Error {
+    fn from(err: ParseIntError) -> Error {
+        Error::Parse(err.to_string())
+    }
+}
+
 impl fmt::Display for Error {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match *self {

diff --git a/src/json_shred.rs b/src/json_shred.rs
index 7696126..dee9eac 100644
--- a/src/json_shred.rs
+++ b/src/json_shred.rs
@@ -18,7 +18,7 @@ use stems::Stems;

 #[derive(Debug, PartialEq)]
 struct WordInfo {
     //offset in the text field where the stemmed text starts
-    stemmed_offset: u64,
+    word_pos: u64,

     // the suffix of the stemmed text. When applied over stemmed, the original
     // text is returned.
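The Wordinfo comments above suggest that the original text of a word is
recoverable by keeping the first suffixOffset bytes of the stemmed word and
appending suffixText. A minimal sketch of that reconstruction, assuming
suffixOffset is a byte offset into the stemmed word (hypothetical helper, not
part of this patch):

    // Rebuild the original word from the stemmed form plus suffix data.
    fn original_word(stemmed: &str, suffix_offset: usize, suffix_text: &str) -> String {
        let mut word = stemmed[..suffix_offset].to_string();
        word.push_str(suffix_text);
        word
    }

    fn main() {
        // Mirrors the "nested" case in the tests below: "nested" stems to
        // "nest"; they share the 4-byte prefix "nest", so the suffix "ed"
        // starts at offset 4.
        assert_eq!(original_word("nest", 4, "ed"), "nested");
    }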
@@ -72,7 +72,7 @@ impl Shredder { for stem in stems { let word_infos = word_to_word_infos.entry(stem.stemmed).or_insert(Vec::new()); word_infos.push(WordInfo{ - stemmed_offset: stem.stemmed_offset as u64, + word_pos: stem.word_pos as u64, suffix_text: stem.suffix.to_string(), suffix_offset: stem.suffix_offset as u64, }); @@ -84,7 +84,7 @@ impl Shredder { let mut capn_wordinfos = capn_payload.init_wordinfos(word_infos.len() as u32); for (pos, word_info) in word_infos.iter().enumerate() { let mut capn_wordinfo = capn_wordinfos.borrow().get(pos as u32); - capn_wordinfo.set_stemmed_offset(word_info.stemmed_offset); + capn_wordinfo.set_word_pos(word_info.word_pos); capn_wordinfo.set_suffix_text(&word_info.suffix_text); capn_wordinfo.set_suffix_offset(word_info.suffix_offset); } @@ -245,7 +245,7 @@ mod tests { let mut wordinfos = Vec::new(); for wi in payload.get_wordinfos().unwrap().iter() { wordinfos.push(WordInfo{ - stemmed_offset: wi.get_stemmed_offset(), + word_pos: wi.get_word_pos(), suffix_text: wi.get_suffix_text().unwrap().to_string(), suffix_offset: wi.get_suffix_offset(), }); @@ -271,13 +271,13 @@ mod tests { let expected = vec![ ("W.some$!array#123,0".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 5 }]), + WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 5 }]), ("W.some$!data#123,1".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), + WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), ("W.some$$!also#123,2,0".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), + WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), ("W.some$$!nest#123,2,1".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "ed".to_string(), suffix_offset: 4 }]), + WordInfo { word_pos: 0, suffix_text: "ed".to_string(), suffix_offset: 4 }]), ]; assert_eq!(result, expected); } @@ -300,22 +300,22 @@ mod tests { println!("result: {:?}", result); let expected = vec![ ("W.A$.B!b1#1234,1".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), + WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), ("W.A$.B!b2vmx#1234,0".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "B2 VMX ".to_string(), + WordInfo { word_pos: 0, suffix_text: "B2 VMX ".to_string(), suffix_offset: 0 }]), ("W.A$.B!three#1234,0".to_string(), vec![ - WordInfo { stemmed_offset: 10, suffix_text: "".to_string(), suffix_offset: 15 }]), + WordInfo { word_pos: 10, suffix_text: "".to_string(), suffix_offset: 15 }]), ("W.A$.B!two#1234,0".to_string(), vec![ - WordInfo { stemmed_offset: 6, suffix_text: " ".to_string(), suffix_offset: 9 }]), + WordInfo { word_pos: 6, suffix_text: " ".to_string(), suffix_offset: 9 }]), ("W.A$.C!..#1234,0".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), + WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), ("W.A$.C!..#1234,1".to_string(), vec![ - WordInfo { stemmed_offset: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), + WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), ("W.A$.C!c2#1234,0".to_string(), vec![ - WordInfo { stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), + WordInfo { word_pos: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), ("W.A$.C!c2#1234,1".to_string(), vec![ - WordInfo { 
stemmed_offset: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), + WordInfo { word_pos: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), ]; assert_eq!(result, expected); } diff --git a/src/query.rs b/src/query.rs index 5d6a44f..3dc5098 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,9 +1,12 @@ #![allow(dead_code)] #![allow(unused_variables)] + extern crate capnp; use std::str; use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::collections::HashSet; use error::Error; use index::Index; @@ -98,20 +101,17 @@ impl<'a> QueryResults<'a> { struct ExactMatchFilter { iter: DBIterator, keypathword: String, - stemmed: String, - stemmed_offset: u64, + word_pos: u64, suffix: String, suffix_offset: u64, } impl ExactMatchFilter { - fn new(iter: DBIterator, stemmed_word: &StemmedWord, kb: KeyBuilder) -> ExactMatchFilter { - let keypathword = kb.get_keypathword_only(&stemmed_word.stemmed); + fn new(iter: DBIterator, stemmed_word: &StemmedWord, kb: &KeyBuilder) -> ExactMatchFilter { ExactMatchFilter{ iter: iter, - keypathword: keypathword, - stemmed: stemmed_word.stemmed.clone(), - stemmed_offset: stemmed_word.stemmed_offset as u64, + keypathword: kb.get_keypathword_only(&stemmed_word.stemmed), + word_pos: stemmed_word.word_pos as u64, suffix: stemmed_word.suffix.clone(), suffix_offset: stemmed_word.suffix_offset as u64, } @@ -137,49 +137,345 @@ impl QueryRuntimeFilter for ExactMatchFilter { return Ok(None) } - // New scope needed as the iter.next() below invalidates the - // current key and value - { - let (key, value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return Ok(None), - }; - if !key.starts_with(self.keypathword.as_bytes()) { - // we passed the key path we are interested in. nothing left to do */ - return Ok(None) + let (key, value) = match self.iter.next() { + Some((key, value)) => (key, value), + None => return Ok(None), + }; + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. 
nothing left to do */ + return Ok(None) + } + + // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed + // and why we pass on mutable reference of it to `read_message()` + let mut ref_value = &*value; + let message_reader = ::capnp::serialize_packed::read_message( + &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + let payload = message_reader.get_root::().unwrap(); + + for wi in try!(payload.get_wordinfos()).iter() { + if self.word_pos == wi.get_word_pos() && + self.suffix_offset == wi.get_suffix_offset() && + self.suffix == try!(wi.get_suffix_text()) { + // We have a candidate document to return + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + return Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))); + } + } + } + } +} + +struct StemmedWordFilter { + iter: DBIterator, + keypathword: String, +} + +impl StemmedWordFilter { + fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordFilter { + StemmedWordFilter { + iter: iter, + keypathword: kb.get_keypathword_only(&stemmed_word), + } + } +} + +impl QueryRuntimeFilter for StemmedWordFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + + KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); + // Seek in index to >= entry + self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), + rocksdb::Direction::Forward)); + + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); + + self.next_result() + } + + fn next_result(&mut self) -> Result, Error> { + if !self.iter.valid() { + return Ok(None) + } + + let (key, value) = match self.iter.next() { + Some((key, value)) => (key, value), + None => return Ok(None), + }; + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. nothing left to do */ + return Ok(None) + } + + // We have a candidate document to return + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))) + } +} + +/// This is not a QueryRuntimeFilter but it imitates one. Instead of returning just a DocResult +/// it also return a vector of word positions, each being a instance of the word occurance +struct StemmedWordPosFilter { + iter: DBIterator, + keypathword: String, +} + +impl StemmedWordPosFilter { + fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordPosFilter { + StemmedWordPosFilter{ + iter: iter, + keypathword: kb.get_keypathword_only(&stemmed_word), + } + } + + fn first_result(&mut self, + start: &DocResult) -> Result)>, Error> { + + KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); + // Seek in index to >= entry + self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), + rocksdb::Direction::Forward)); + + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); + + self.next_result() + } + + fn next_result(&mut self) -> Result)>, Error> { + if !self.iter.valid() { + return Ok(None) + } + + let (key, value) = match self.iter.next() { + Some((key, value)) => (key, value), + None => return Ok(None), + }; + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. 
nothing left to do */ + return Ok(None) + } + let mut ref_value = &*value; + let message_reader = ::capnp::serialize_packed::read_message( + &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + let payload = message_reader.get_root::().unwrap(); + + let positions = try!(payload.get_wordinfos()) + .iter() + .map(|wi| wi.get_word_pos()as i64) + .collect(); + + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + let docresult = KeyBuilder::parse_doc_result_from_key(&key_str); + + Ok(Some((docresult, positions))) + } +} + +struct StemmedPhraseFilter { + filters: Vec, +} + +impl StemmedPhraseFilter { + fn new(filters: Vec) -> StemmedPhraseFilter { + StemmedPhraseFilter { + filters: filters, + } + } + + fn result(&mut self, + base: Option<(DocResult, Vec)>) -> Result, Error> { + // this is the number of matches left before all terms match and we can return a result + let mut matches_left = self.filters.len() - 1; + + if base.is_none() { return Ok(None); } + let (mut base_result, mut base_positions) = base.unwrap(); + + let mut current_filter = 0; + loop { + current_filter += 1; + if current_filter == self.filters.len() { + current_filter = 0; + } + + let next = try!(self.filters[current_filter].first_result(&base_result)); + + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + + if base_result == next_result { + let mut new_positions = Vec::new(); + for &pos in next_positions.iter() { + if let Ok(_) = base_positions.binary_search(&(pos-1)) { + new_positions.push(pos); + } } + if new_positions.len() > 0 { + // we have valus that survive! reassign back to base_positions + base_positions = new_positions; + matches_left -= 1; - // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed - // and why we pass on mutable reference of it to `read_message()` - //let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut &*value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - for wi in try!(payload.get_wordinfos()).iter() { - if self.stemmed_offset == wi.get_stemmed_offset() && - self.suffix_offset == wi.get_suffix_offset() && - self.suffix == try!(wi.get_suffix_text()) { - // We have a candidate document to return - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - return Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))); + if matches_left == 0 { + return Ok(Some(base_result)); } + } else { + // we didn't match on phrase, so get next_result from first filter + current_filter = 0; + let next = try!(self.filters[current_filter].next_result()); + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions; + + matches_left = self.filters.len() - 1; } + } else { + // we didn't match on next_result, so get first_result at next_result on + // 1st filter. 
+ current_filter = 0; + let next = try!(self.filters[current_filter].first_result(&next_result)); + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions; + + matches_left = self.filters.len() - 1; } } } } +impl QueryRuntimeFilter for StemmedPhraseFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + let base_result = try!(self.filters[0].first_result(start)); + self.result(base_result) + } + + fn next_result(&mut self) -> Result, Error> { + let base_result = try!(self.filters[0].next_result()); + self.result(base_result) + } +} -struct DummyFilter {} +struct DistanceFilter { + filters: Vec, + current_filter: usize, + distance: i64, +} -impl QueryRuntimeFilter for DummyFilter { +impl DistanceFilter { + fn new(filters: Vec, distance: i64) -> DistanceFilter { + DistanceFilter { + filters: filters, + current_filter: 0, + distance: distance, + } + } + + fn result(&mut self, + base: Option<(DocResult, Vec)>) -> Result, Error> { + // yes this code complex. I tried to break it up, but it wants to be like this. + + // this is the number of matches left before all terms match and we can return a result + let mut matches_left = self.filters.len() - 1; + + if base.is_none() { return Ok(None); } + let (mut base_result, positions) = base.unwrap(); + + // This contains tuples of word postions and the filter they came from, + // sorted by word position. + let mut base_positions: Vec<(i64, usize)> = positions.iter() + .map(|pos|(*pos, self.current_filter)) + .collect(); + + // distance is number of words between searched words. + // add one to make calculating difference easier since abs(posa - posb) == distance + 1 + let dis = self.distance + 1; + loop { + self.current_filter += 1; + if self.current_filter == self.filters.len() { + self.current_filter = 0; + } + + let next = try!(self.filters[self.current_filter].first_result(&base_result)); + + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + + if base_result == next_result { + // so we are in the same field. Now to check the proximity of the values from the + // next result to previous results. + + // new_positions_map will accept positions within range of pos. But only if all + // positions that can be are within range. We use the sorted map so we can add + // the same positions multiple times and it's a noop. + let mut new_positions_map = BTreeMap::new(); + for &pos in next_positions.iter() { + // coud these lines be any longer? No they could not. + let start = match base_positions.binary_search_by_key(&(pos-dis), + |&(pos2,_)| pos2) { + Ok(start) => start, + Err(start) => start, + }; + + let end = match base_positions.binary_search_by_key(&(pos+dis), + |&(pos2,_)| pos2) { + Ok(end) => end, + Err(end) => end, + }; + + // we now collect all the filters within the range + let mut filters_encountered = HashSet::new(); + for &(_, filter_n) in base_positions[start..end].iter() { + filters_encountered.insert(filter_n); + } + + if filters_encountered.len() == self.filters.len() - matches_left { + // we encountered all the filters we can at this stage, + // so we should add them all to the new_positions_map + for &(prev_pos, filter_n) in base_positions[start..end].iter() { + new_positions_map.insert(prev_pos, filter_n); + } + // and add the current pos + new_positions_map.insert(pos, self.current_filter); + } + } + if new_positions_map.len() > 0 { + // we have valus that survive! 
reassign back to positions + base_positions = new_positions_map.into_iter().collect(); + matches_left -= 1; + + if matches_left == 0 { + return Ok(Some(base_result)); + } else { + continue; + } + } + } + // we didn't match on next_result, so get next_result on current filter + let next = try!(self.filters[self.current_filter].next_result()); + + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions.iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); + + matches_left = self.filters.len() - 1; + } + } +} + +impl QueryRuntimeFilter for DistanceFilter { fn first_result(&mut self, start: &DocResult) -> Result, Error> { - Ok(None) + let base_result = try!(self.filters[self.current_filter].first_result(start)); + self.result(base_result) } + fn next_result(&mut self) -> Result, Error> { - Ok(None) + let base_result = try!(self.filters[self.current_filter].next_result()); + self.result(base_result) } } @@ -197,15 +493,14 @@ impl<'a> AndFilter<'a> { current_filter: 0, array_depth: array_depth, } - } + } fn result(&mut self, base: Option) -> Result, Error> { let mut matches_count = self.filters.len() - 1; - // TODO vmx 2016-11-04: Make it nicer - let mut base_result = match base { - Some(base_result) => base_result, - None => return Ok(None), - }; + + if base.is_none() { return Ok(None); } + let mut base_result = base.unwrap(); + base_result.arraypath.resize(self.array_depth, 0); loop { @@ -215,10 +510,10 @@ impl<'a> AndFilter<'a> { } let next = try!(self.filters[self.current_filter].first_result(&base_result)); - let mut next_result = match next { - Some(next_result) => next_result, - None => return Ok(None), - }; + + if next.is_none() { return Ok(None); } + let mut next_result = next.unwrap(); + next_result.arraypath.resize(self.array_depth, 0); if base_result == next_result { @@ -439,6 +734,24 @@ impl<'a> Parser<'a> { } } + fn consume_integer(&mut self) -> Result, Error> { + let mut result = String::new(); + for char in self.query[self.offset..].chars() { + if char >= '0' && char <= '9' { + result.push(char); + } else { + break; + } + } + if !result.is_empty() { + self.offset += result.len(); + self.ws(); + Ok(Some(try!(result.parse()))) + } else { + Ok(None) + } + } + fn consume_string_literal(&mut self) -> Result { let mut lit = String::new(); let mut next_is_special_char = false; @@ -504,8 +817,7 @@ oparens / object compare - = ("==" / "~=" / "^=" ) ws string ws - + = ("==" / "~=" / "~" digits "=" ) ws string ws abool = ws acompare ws (('&&'/ ',' / '||') ws abool)? 
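The digit scan in consume_integer above behaves like the following standalone
sketch (a hypothetical free function; the actual method lives on Parser as
fn consume_integer(&mut self) -> Result<Option<i64>, Error> and also consumes
trailing whitespace):

    // Collect leading ASCII digits, advance past them, and parse the run.
    fn consume_integer(query: &str, offset: &mut usize) -> Option<i64> {
        let digits: String = query[*offset..]
            .chars()
            .take_while(|c| *c >= '0' && *c <= '9')
            .collect();
        if digits.is_empty() {
            None
        } else {
            *offset += digits.len();
            Some(digits.parse().expect("digit run fits in i64"))
        }
    }

    fn main() {
        let mut offset = 1; // just past the '~' in "~10="
        assert_eq!(consume_integer("~10=", &mut offset), Some(10));
        assert_eq!(offset, 3); // now pointing at the '='
    }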
@@ -531,6 +843,9 @@ field
 string
    = '"' ('\\\\' / '\\' [\"tfvrnb] / [^\\\"])* '"' ws

+digits
+   = [0-9]+
+
 ws
    = [ \t\n\r]*
@@ -630,14 +945,57 @@ ws1
             for stem in stems {
                 let iter = self.snapshot.iterator(IteratorMode::Start);
                 let filter = Box::new(ExactMatchFilter::new(
-                    iter, &stem, self.kb.clone()));
+                    iter, &stem, &self.kb));
                 filters.push(filter);
             }
             match filters.len() {
                 0 => panic!("Cannot create a ExactMatchFilter"),
                 1 => Ok(filters.pop().unwrap()),
-                _ => Ok(Box::new(AndFilter::new(
-                    filters, self.kb.arraypath_len()))),
+                _ => Ok(Box::new(AndFilter::new(filters, self.kb.arraypath_len()))),
+            }
+        } else if self.consume("~=") {
+            // regular search
+            let literal = try!(self.consume_string_literal());
+            let stems = Stems::new(&literal);
+            let stemmed_words: Vec<String> = stems.map(|stem| stem.stemmed).collect();
+
+            match stemmed_words.len() {
+                0 => panic!("Cannot create a StemmedWordFilter"),
+                1 => {
+                    let iter = self.snapshot.iterator(IteratorMode::Start);
+                    Ok(Box::new(StemmedWordFilter::new(iter, &stemmed_words[0], &self.kb)))
+                },
+                _ => {
+                    let mut filters: Vec<StemmedWordPosFilter> = Vec::new();
+                    for stemmed_word in stemmed_words {
+                        let iter = self.snapshot.iterator(IteratorMode::Start);
+                        let filter = StemmedWordPosFilter::new(iter, &stemmed_word, &self.kb);
+                        filters.push(filter);
+                    }
+                    Ok(Box::new(StemmedPhraseFilter::new(filters)))
+                },
+            }
+        } else if self.consume("~") {
+            let word_distance = match try!(self.consume_integer()) {
+                Some(int) => int,
+                None => {
+                    return Err(Error::Parse("Expected integer for proximity search".to_string()));
+                },
+            };
+            try!(self.must_consume("="));
+
+            let literal = try!(self.consume_string_literal());
+            let stems = Stems::new(&literal);
+            let mut filters: Vec<StemmedWordPosFilter> = Vec::new();
+            for stem in stems {
+                let iter = self.snapshot.iterator(IteratorMode::Start);
+                let filter = StemmedWordPosFilter::new(
+                    iter, &stem.stemmed, &self.kb);
+                filters.push(filter);
+            }
+            match filters.len() {
+                0 => panic!("Cannot create a DistanceFilter"),
+                _ => Ok(Box::new(DistanceFilter::new(filters, word_distance))),
             }
         } else {
             Err(Error::Parse("Expected comparison operator".to_string()))
@@ -786,6 +1144,8 @@ mod tests {
         let _ = index.add(r#"{"_id":"7", "A":[{"B":"B3"},{"B": "B4"}]}"#);
         let _ = index.add(r#"{"_id":"8", "A":["A1", "A1"]}"#);
         let _ = index.add(r#"{"_id":"9", "A":["A1", "A2"]}"#);
+        let _ = index.add(r#"{"_id":"10", "A":"a bunch of words in this sentence"}"#);
+        let _ = index.add(r#"{"_id":"11", "A":""}"#);

         index.flush().unwrap();
@@ -797,6 +1157,9 @@ mod tests {
         assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string()));
         assert_eq!(query_results.get_next_id().unwrap(), None);

+        query_results = Query::get_matches(r#"find {A:[{B: == "B2", C: == "C8"}]}"#.to_string(), &index).unwrap();
+        assert_eq!(query_results.get_next_id().unwrap(), None);
+
         query_results = Query::get_matches(r#"find {A:[{B: == "b1", C: == "C2"}]}"#.to_string(), &index).unwrap();
         assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string()));
         assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string()));
@@ -828,6 +1191,44 @@ mod tests {
         assert_eq!(query_results.get_next_id().unwrap(), Some("8".to_string()));
         assert_eq!(query_results.get_next_id().unwrap(), Some("9".to_string()));
         assert_eq!(query_results.get_next_id().unwrap(), None);
+
+        query_results = Query::get_matches(r#"find {A: ~= "Multi"}"#.to_string(), &index).unwrap();
+
assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~= "multi word"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~= "word sentence"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~= "sentence word"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~1= "multi sentence"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~4= "a sentence"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~5= "a sentence"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~4= "a bunch of words sentence"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A: ~5= "a bunch of words sentence"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + + query_results = Query::get_matches(r#"find {A: == ""}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("11".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); } diff --git a/src/stems.rs b/src/stems.rs index 9bfe5ea..df84e07 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -12,12 +12,13 @@ use self::unicode_segmentation::UnicodeSegmentation; pub struct Stems<'a> { words: Peekable>, stemmer: Stemmer, + word_position: usize, } #[derive(Debug, PartialEq)] pub struct StemmedWord { // Where the stemmed word starts - pub stemmed_offset: usize, + pub word_pos: usize, // Where the suffix starts pub suffix_offset: usize, // The stemmed word @@ -33,6 +34,7 @@ impl<'a> Stems<'a> { Stems{ words: text.split_word_bound_indices().peekable(), stemmer: Stemmer::new("english").unwrap(), + word_position: 0, } } @@ -50,14 +52,12 @@ impl<'a> Iterator for Stems<'a> { fn next(&mut self) -> Option { let mut word_to_stem = String::new(); - let mut stemmed_offset = 0; let mut normalized = String::new(); loop { match self.words.peek() { - Some(&(pos, word)) => { + Some(&(_pos, word)) => { normalized = word.nfkc().collect::(); if word.chars().next().unwrap().is_alphabetic() { - stemmed_offset = pos; break; } else { word_to_stem.push_str(&normalized); @@ -66,7 +66,22 @@ impl<'a> Iterator for Stems<'a> { }, None => { if word_to_stem.is_empty() { - return None; + if self.word_position == 0 { + self.word_position = 1; + // in this case we were passed an empty string + // so we don't just return None, but we return + // an empty string Stemmed word. + // otherwise searching fields with empty strings + // wouldn't be possible. 
+ return Some(StemmedWord { + word_pos: 0, + suffix_offset: 0, + stemmed: String::new(), + suffix: String::new(), + }); + } else { + return None; + } } else { break; } @@ -77,8 +92,10 @@ impl<'a> Iterator for Stems<'a> { if !word_to_stem.is_empty() { // we found the begining of the string is not a stemmable word. // Return the accumulated string as the stemmed word + debug_assert!(self.word_position == 0); + self.word_position += 1; return Some(StemmedWord { - stemmed_offset: 0, + word_pos: 0, suffix_offset: word_to_stem.len(), stemmed: word_to_stem, suffix: String::new(), @@ -89,8 +106,7 @@ impl<'a> Iterator for Stems<'a> { word_to_stem = normalized; let mut suffix = word_to_stem.clone(); loop { - // loop through all non-alphabetic chars and add to suffix (which means the suffix starts - // before the stemmed word) + // loop through all non-alphabetic chars and add to suffix match self.words.peek() { Some(&(_pos, word)) => { normalized = word.nfkc().collect::(); @@ -106,12 +122,14 @@ impl<'a> Iterator for Stems<'a> { } let stemmed = self.stemmer.stem(&word_to_stem.to_lowercase()); let prefix_len = Stems::common_prefix_len(&stemmed, &suffix); - Some(StemmedWord { - stemmed_offset: stemmed_offset, - suffix_offset: stemmed_offset + prefix_len, - stemmed: stemmed, - suffix: (&suffix[prefix_len..]).to_string(), - }) + let ret = StemmedWord { + word_pos: self.word_position, + suffix_offset: prefix_len, + stemmed: stemmed, + suffix: (&suffix[prefix_len..]).to_string(), + }; + self.word_position += 1; + Some(ret) } } @@ -125,20 +143,20 @@ mod tests { let input = "THEse Words deeplY test smOOthly that stemmING"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 0, + StemmedWord { word_pos: 0, suffix_offset: 0, stemmed: String::from("these"), suffix: String::from("THEse ") }, - StemmedWord { stemmed_offset: 6, suffix_offset: 6, + StemmedWord { word_pos: 6, suffix_offset: 6, stemmed: String::from("word"), suffix: String::from("Words ") }, // "deeply" stems to "deepli" - StemmedWord { stemmed_offset: 12, suffix_offset: 17, + StemmedWord { word_pos: 12, suffix_offset: 17, stemmed: String::from("deepli"), suffix: String::from("Y ") }, - StemmedWord { stemmed_offset: 19, suffix_offset: 23, + StemmedWord { word_pos: 19, suffix_offset: 23, stemmed: String::from("test"), suffix: String::from(" ") }, - StemmedWord { stemmed_offset: 24, suffix_offset: 26, + StemmedWord { word_pos: 24, suffix_offset: 26, stemmed: String::from("smooth"), suffix: String::from("OOthly ") }, - StemmedWord { stemmed_offset: 33, suffix_offset: 37, + StemmedWord { word_pos: 33, suffix_offset: 37, stemmed: String::from("that"), suffix: String::from(" ") }, - StemmedWord { stemmed_offset: 38, suffix_offset: 42, + StemmedWord { word_pos: 38, suffix_offset: 42, stemmed: String::from("stem"), suffix: String::from("mING") }, ]; assert_eq!(result.len(), expected.len()); @@ -152,7 +170,7 @@ mod tests { let input = " @#$!== \t+-"; let result = Stems::new(input).collect::>(); assert_eq!(result, vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 12, + StemmedWord { word_pos: 0, suffix_offset: 12, stemmed: String::from(" @#$!== \t+-"), suffix: String::from("") }, ]); } @@ -162,11 +180,11 @@ mod tests { let input = "@!? Let's seeing..."; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 6, + StemmedWord { word_pos: 0, suffix_offset: 6, stemmed: String::from("@!? 
"), suffix: String::from("") }, - StemmedWord { stemmed_offset: 6, suffix_offset: 6, + StemmedWord { word_pos: 6, suffix_offset: 6, stemmed: String::from("let"), suffix: String::from("Let's ") }, - StemmedWord { stemmed_offset: 12, suffix_offset: 15, + StemmedWord { word_pos: 12, suffix_offset: 15, stemmed: String::from("see"), suffix: String::from("ing...") }, ]; assert_eq!(result.len(), expected.len()); @@ -180,9 +198,9 @@ mod tests { let input = "Ünicöde stemming"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 0, + StemmedWord { word_pos: 0, suffix_offset: 0, stemmed: String::from("ünicöd"), suffix: String::from("Ünicöde ") }, - StemmedWord { stemmed_offset: 10, suffix_offset: 14, + StemmedWord { word_pos: 10, suffix_offset: 14, stemmed: String::from("stem"), suffix: String::from("ming") }, ]; assert_eq!(result.len(), expected.len()); @@ -196,7 +214,7 @@ mod tests { let input = "İ"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 0, + StemmedWord { word_pos: 0, suffix_offset: 0, stemmed: String::from("i̇"), suffix: String::from("İ") }, ]; assert_eq!(result.len(), expected.len()); @@ -231,7 +249,7 @@ mod tests { let input = "\u{03A1}\u{0313}\u{03C1}\u{0313}\u{1FE4}"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { stemmed_offset: 0, suffix_offset: 0, + StemmedWord { word_pos: 0, suffix_offset: 0, stemmed: String::from("\u{03C1}\u{0313}\u{1FE4}\u{1FE4}"), suffix: String::from("\u{03A1}\u{0313}\u{1FE4}\u{1FE4}") }, ]; From 3dcb411cac74f98687e3fc91c0e934b3d9c97132 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Mon, 19 Dec 2016 16:15:56 -0800 Subject: [PATCH 045/122] Move QueryRuntimeFilters into new file filters.rs --- src/filters.rs | 584 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/query.rs | 590 +------------------------------------------------ 3 files changed, 589 insertions(+), 586 deletions(-) create mode 100644 src/filters.rs diff --git a/src/filters.rs b/src/filters.rs new file mode 100644 index 0000000..070c613 --- /dev/null +++ b/src/filters.rs @@ -0,0 +1,584 @@ +extern crate capnp; + +use std::str; +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::collections::HashSet; + +use error::Error; +use key_builder::KeyBuilder; +use stems::StemmedWord; +use query::DocResult; + +// TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs +use rocksdb::{self, DBIterator, IteratorMode}; +use records_capnp::payload; + + +pub trait QueryRuntimeFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error>; + fn next_result(&mut self) -> Result, Error>; +} + +pub struct ExactMatchFilter { + iter: DBIterator, + keypathword: String, + word_pos: u64, + suffix: String, + suffix_offset: u64, +} + +impl ExactMatchFilter { + pub fn new(iter: DBIterator, stemmed_word: &StemmedWord, kb: &KeyBuilder) -> ExactMatchFilter { + ExactMatchFilter{ + iter: iter, + keypathword: kb.get_keypathword_only(&stemmed_word.stemmed), + word_pos: stemmed_word.word_pos as u64, + suffix: stemmed_word.suffix.clone(), + suffix_offset: stemmed_word.suffix_offset as u64, + } + } +} + +impl QueryRuntimeFilter for ExactMatchFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + + KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); + // Seek in index to >= entry + 
self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), + rocksdb::Direction::Forward)); + + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); + + self.next_result() + } + + fn next_result(&mut self) -> Result, Error> { + loop { + if !self.iter.valid() { + return Ok(None) + } + + let (key, value) = match self.iter.next() { + Some((key, value)) => (key, value), + None => return Ok(None), + }; + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. nothing left to do */ + return Ok(None) + } + + // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed + // and why we pass on mutable reference of it to `read_message()` + let mut ref_value = &*value; + let message_reader = ::capnp::serialize_packed::read_message( + &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + let payload = message_reader.get_root::().unwrap(); + + for wi in try!(payload.get_wordinfos()).iter() { + if self.word_pos == wi.get_word_pos() && + self.suffix_offset == wi.get_suffix_offset() && + self.suffix == try!(wi.get_suffix_text()) { + // We have a candidate document to return + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + return Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))); + } + } + } + } +} + +pub struct StemmedWordFilter { + iter: DBIterator, + keypathword: String, +} + +impl StemmedWordFilter { + pub fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordFilter { + StemmedWordFilter { + iter: iter, + keypathword: kb.get_keypathword_only(&stemmed_word), + } + } +} + +impl QueryRuntimeFilter for StemmedWordFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + + KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); + // Seek in index to >= entry + self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), + rocksdb::Direction::Forward)); + + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); + + self.next_result() + } + + fn next_result(&mut self) -> Result, Error> { + if !self.iter.valid() { + return Ok(None) + } + + let key = match self.iter.next() { + Some((key, _value)) => key, + None => return Ok(None), + }; + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. nothing left to do */ + return Ok(None) + } + + // We have a candidate document to return + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))) + } +} + +/// This is not a QueryRuntimeFilter but it imitates one. 
Instead of returning just a DocResult +/// it also return a vector of word positions, each being a instance of the word occurance +pub struct StemmedWordPosFilter { + iter: DBIterator, + keypathword: String, +} + +impl StemmedWordPosFilter { + pub fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordPosFilter { + StemmedWordPosFilter{ + iter: iter, + keypathword: kb.get_keypathword_only(&stemmed_word), + } + } + + fn first_result(&mut self, + start: &DocResult) -> Result)>, Error> { + + KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); + // Seek in index to >= entry + self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), + rocksdb::Direction::Forward)); + + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); + + self.next_result() + } + + fn next_result(&mut self) -> Result)>, Error> { + if !self.iter.valid() { + return Ok(None) + } + + let (key, value) = match self.iter.next() { + Some((key, value)) => (key, value), + None => return Ok(None), + }; + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. nothing left to do */ + return Ok(None) + } + let mut ref_value = &*value; + let message_reader = ::capnp::serialize_packed::read_message( + &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + let payload = message_reader.get_root::().unwrap(); + + let positions = try!(payload.get_wordinfos()) + .iter() + .map(|wi| wi.get_word_pos()as i64) + .collect(); + + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + let docresult = KeyBuilder::parse_doc_result_from_key(&key_str); + + Ok(Some((docresult, positions))) + } +} + +pub struct StemmedPhraseFilter { + filters: Vec, +} + +impl StemmedPhraseFilter { + pub fn new(filters: Vec) -> StemmedPhraseFilter { + StemmedPhraseFilter { + filters: filters, + } + } + + fn result(&mut self, + base: Option<(DocResult, Vec)>) -> Result, Error> { + // this is the number of matches left before all terms match and we can return a result + let mut matches_left = self.filters.len() - 1; + + if base.is_none() { return Ok(None); } + let (mut base_result, mut base_positions) = base.unwrap(); + + let mut current_filter = 0; + loop { + current_filter += 1; + if current_filter == self.filters.len() { + current_filter = 0; + } + + let next = try!(self.filters[current_filter].first_result(&base_result)); + + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + + if base_result == next_result { + let mut new_positions = Vec::new(); + for &pos in next_positions.iter() { + if let Ok(_) = base_positions.binary_search(&(pos-1)) { + new_positions.push(pos); + } + } + if new_positions.len() > 0 { + // we have valus that survive! reassign back to base_positions + base_positions = new_positions; + matches_left -= 1; + + if matches_left == 0 { + return Ok(Some(base_result)); + } + } else { + // we didn't match on phrase, so get next_result from first filter + current_filter = 0; + let next = try!(self.filters[current_filter].next_result()); + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions; + + matches_left = self.filters.len() - 1; + } + } else { + // we didn't match on next_result, so get first_result at next_result on + // 1st filter. 
+ current_filter = 0; + let next = try!(self.filters[current_filter].first_result(&next_result)); + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions; + + matches_left = self.filters.len() - 1; + } + } + } +} + + +impl QueryRuntimeFilter for StemmedPhraseFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + let base_result = try!(self.filters[0].first_result(start)); + self.result(base_result) + } + + fn next_result(&mut self) -> Result, Error> { + let base_result = try!(self.filters[0].next_result()); + self.result(base_result) + } +} + +pub struct DistanceFilter { + filters: Vec, + current_filter: usize, + distance: i64, +} + +impl DistanceFilter { + pub fn new(filters: Vec, distance: i64) -> DistanceFilter { + DistanceFilter { + filters: filters, + current_filter: 0, + distance: distance, + } + } + + fn result(&mut self, + base: Option<(DocResult, Vec)>) -> Result, Error> { + // yes this code complex. I tried to break it up, but it wants to be like this. + + // this is the number of matches left before all terms match and we can return a result + let mut matches_left = self.filters.len() - 1; + + if base.is_none() { return Ok(None); } + let (mut base_result, positions) = base.unwrap(); + + // This contains tuples of word postions and the filter they came from, + // sorted by word position. + let mut base_positions: Vec<(i64, usize)> = positions.iter() + .map(|pos|(*pos, self.current_filter)) + .collect(); + + // distance is number of words between searched words. + // add one to make calculating difference easier since abs(posa - posb) == distance + 1 + let dis = self.distance + 1; + loop { + self.current_filter += 1; + if self.current_filter == self.filters.len() { + self.current_filter = 0; + } + + let next = try!(self.filters[self.current_filter].first_result(&base_result)); + + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + + if base_result == next_result { + // so we are in the same field. Now to check the proximity of the values from the + // next result to previous results. + + // new_positions_map will accept positions within range of pos. But only if all + // positions that can be are within range. We use the sorted map so we can add + // the same positions multiple times and it's a noop. + let mut new_positions_map = BTreeMap::new(); + for &pos in next_positions.iter() { + // coud these lines be any longer? No they could not. + let start = match base_positions.binary_search_by_key(&(pos-dis), + |&(pos2,_)| pos2) { + Ok(start) => start, + Err(start) => start, + }; + + let end = match base_positions.binary_search_by_key(&(pos+dis), + |&(pos2,_)| pos2) { + Ok(end) => end, + Err(end) => end, + }; + + // we now collect all the filters within the range + let mut filters_encountered = HashSet::new(); + for &(_, filter_n) in base_positions[start..end].iter() { + filters_encountered.insert(filter_n); + } + + if filters_encountered.len() == self.filters.len() - matches_left { + // we encountered all the filters we can at this stage, + // so we should add them all to the new_positions_map + for &(prev_pos, filter_n) in base_positions[start..end].iter() { + new_positions_map.insert(prev_pos, filter_n); + } + // and add the current pos + new_positions_map.insert(pos, self.current_filter); + } + } + if new_positions_map.len() > 0 { + // we have valus that survive! 
reassign back to positions + base_positions = new_positions_map.into_iter().collect(); + matches_left -= 1; + + if matches_left == 0 { + return Ok(Some(base_result)); + } else { + continue; + } + } + } + // we didn't match on next_result, so get next_result on current filter + let next = try!(self.filters[self.current_filter].next_result()); + + if next.is_none() { return Ok(None); } + let (next_result, next_positions) = next.unwrap(); + base_result = next_result; + base_positions = next_positions.iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); + + matches_left = self.filters.len() - 1; + } + } +} + +impl QueryRuntimeFilter for DistanceFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + let base_result = try!(self.filters[self.current_filter].first_result(start)); + self.result(base_result) + } + + fn next_result(&mut self) -> Result, Error> { + let base_result = try!(self.filters[self.current_filter].next_result()); + self.result(base_result) + } +} + + +pub struct AndFilter<'a> { + filters: Vec>, + current_filter: usize, + array_depth: usize, +} + +impl<'a> AndFilter<'a> { + pub fn new(filters: Vec>, array_depth: usize) -> AndFilter<'a> { + AndFilter { + filters: filters, + current_filter: 0, + array_depth: array_depth, + } + } + + fn result(&mut self, base: Option) -> Result, Error> { + let mut matches_count = self.filters.len() - 1; + + if base.is_none() { return Ok(None); } + let mut base_result = base.unwrap(); + + base_result.arraypath.resize(self.array_depth, 0); + + loop { + self.current_filter += 1; + if self.current_filter == self.filters.len() { + self.current_filter = 0; + } + + let next = try!(self.filters[self.current_filter].first_result(&base_result)); + + if next.is_none() { return Ok(None); } + let mut next_result = next.unwrap(); + + next_result.arraypath.resize(self.array_depth, 0); + + if base_result == next_result { + matches_count -= 1; + if matches_count == 0 { + return Ok(Some(base_result)); + } + } else { + base_result = next_result; + matches_count = self.filters.len() - 1; + } + } + } +} + +impl<'a> QueryRuntimeFilter for AndFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + let base_result = try!(self.filters[self.current_filter].first_result(start)); + self.result(base_result) + } + + fn next_result(&mut self) -> Result, Error> { + let base_result = try!(self.filters[self.current_filter].next_result()); + self.result(base_result) + } +} + +/// Used by OrFilter to maintain a already fetched result so we don't refetch when one side isn't +/// returned to caller. Because we won't know which side gets returned until both sides are +/// fetched. 
+pub struct FilterWithResult<'a> { + filter: Box, + result: Option, + is_done: bool, + array_depth: usize, +} + +impl<'a> FilterWithResult<'a> { + fn prime_first_result(&mut self, start: &DocResult) -> Result<(), Error> { + if self.is_done { + return Ok(()) + } + if self.result.is_none() { + self.result = try!(self.filter.first_result(start)); + } else if self.result.as_ref().unwrap() < start { + self.result = try!(self.filter.first_result(start)); + } + if self.result.is_none() { + self.is_done = true; + } else { + self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); + } + Ok(()) + } + + fn prime_next_result(&mut self) -> Result<(), Error> { + if self.is_done { + return Ok(()) + } + if self.result.is_none() { + self.result = try!(self.filter.next_result()); + } + if self.result.is_none() { + self.is_done = true; + } else { + self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); + } + Ok(()) + } +} + +pub struct OrFilter<'a> { + left: FilterWithResult<'a>, + right: FilterWithResult<'a>, +} + +impl<'a> OrFilter<'a> { + pub fn new(left: Box, + right: Box, + array_depth: usize) -> OrFilter<'a> { + OrFilter { + left: FilterWithResult{filter: left, + result: None, + array_depth: array_depth, + is_done: false, + }, + + right: FilterWithResult{filter: right, + result: None, + array_depth: array_depth, + is_done: false, + } + } + } + fn take_smallest(&mut self) -> Option { + if let Some(left) = self.left.result.take() { + // left exists + if let Some(right) = self.right.result.take() { + // both exist, return smallest + match left.cmp(&right) { + Ordering::Less => { + // left is smallest, return and put back right + self.right.result = Some(right); + Some(left) + }, + Ordering::Greater => { + // right is smallest, return and put back left + self.left.result = Some(left); + Some(right) + }, + Ordering::Equal => { + // return one and discard the other so we don't return + // identical result in a subsequent call + Some(left) + }, + } + } else { + // right doesn't exist. return left + Some(left) + } + } else { + // left doesn't exist + if self.right.result.is_some() { + // right exists. return it + self.right.result.take() + } else { + // neither exists. 
return none + None + } + } + } +} + +impl<'a> QueryRuntimeFilter for OrFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + try!(self.left.prime_first_result(start)); + try!(self.right.prime_first_result(start)); + Ok(self.take_smallest()) + } + + fn next_result(&mut self) -> Result, Error> { + try!(self.left.prime_next_result()); + try!(self.right.prime_next_result()); + Ok(self.take_smallest()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 96d690a..80dd5d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ extern crate capnp; extern crate rocksdb; mod error; +mod filters; mod json_shred; mod key_builder; mod stems; diff --git a/src/query.rs b/src/query.rs index 3dc5098..c562554 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,22 +1,19 @@ -#![allow(dead_code)] -#![allow(unused_variables)] extern crate capnp; use std::str; use std::cmp::Ordering; -use std::collections::BTreeMap; -use std::collections::HashSet; use error::Error; use index::Index; use key_builder::KeyBuilder; -use stems::{StemmedWord, Stems}; +use stems::Stems; +use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, + StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{self, DBIterator, IteratorMode, Snapshot}; -use records_capnp::payload; +use rocksdb::{IteratorMode, Snapshot}; #[derive(PartialEq, Eq, PartialOrd, Clone)] @@ -43,15 +40,7 @@ impl Ord for DocResult { } } } -/* -impl Clone for DocResult { - fn clone(&self) -> DocResult { *self } -}*/ -pub trait QueryRuntimeFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error>; - fn next_result(&mut self) -> Result, Error>; -} pub struct Query {} @@ -98,570 +87,6 @@ impl<'a> QueryResults<'a> { } -struct ExactMatchFilter { - iter: DBIterator, - keypathword: String, - word_pos: u64, - suffix: String, - suffix_offset: u64, -} - -impl ExactMatchFilter { - fn new(iter: DBIterator, stemmed_word: &StemmedWord, kb: &KeyBuilder) -> ExactMatchFilter { - ExactMatchFilter{ - iter: iter, - keypathword: kb.get_keypathword_only(&stemmed_word.stemmed), - word_pos: stemmed_word.word_pos as u64, - suffix: stemmed_word.suffix.clone(), - suffix_offset: stemmed_word.suffix_offset as u64, - } - } -} - -impl QueryRuntimeFilter for ExactMatchFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - - KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); - // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), - rocksdb::Direction::Forward)); - - KeyBuilder::truncate_to_keypathword(&mut self.keypathword); - - self.next_result() - } - - fn next_result(&mut self) -> Result, Error> { - loop { - if !self.iter.valid() { - return Ok(None) - } - - let (key, value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return Ok(None), - }; - if !key.starts_with(self.keypathword.as_bytes()) { - // we passed the key path we are interested in. 
nothing left to do */ - return Ok(None) - } - - // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed - // and why we pass on mutable reference of it to `read_message()` - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - for wi in try!(payload.get_wordinfos()).iter() { - if self.word_pos == wi.get_word_pos() && - self.suffix_offset == wi.get_suffix_offset() && - self.suffix == try!(wi.get_suffix_text()) { - // We have a candidate document to return - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - return Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))); - } - } - } - } -} - -struct StemmedWordFilter { - iter: DBIterator, - keypathword: String, -} - -impl StemmedWordFilter { - fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordFilter { - StemmedWordFilter { - iter: iter, - keypathword: kb.get_keypathword_only(&stemmed_word), - } - } -} - -impl QueryRuntimeFilter for StemmedWordFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - - KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); - // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), - rocksdb::Direction::Forward)); - - KeyBuilder::truncate_to_keypathword(&mut self.keypathword); - - self.next_result() - } - - fn next_result(&mut self) -> Result, Error> { - if !self.iter.valid() { - return Ok(None) - } - - let (key, value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return Ok(None), - }; - if !key.starts_with(self.keypathword.as_bytes()) { - // we passed the key path we are interested in. nothing left to do */ - return Ok(None) - } - - // We have a candidate document to return - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))) - } -} - -/// This is not a QueryRuntimeFilter but it imitates one. Instead of returning just a DocResult -/// it also return a vector of word positions, each being a instance of the word occurance -struct StemmedWordPosFilter { - iter: DBIterator, - keypathword: String, -} - -impl StemmedWordPosFilter { - fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordPosFilter { - StemmedWordPosFilter{ - iter: iter, - keypathword: kb.get_keypathword_only(&stemmed_word), - } - } - - fn first_result(&mut self, - start: &DocResult) -> Result)>, Error> { - - KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); - // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), - rocksdb::Direction::Forward)); - - KeyBuilder::truncate_to_keypathword(&mut self.keypathword); - - self.next_result() - } - - fn next_result(&mut self) -> Result)>, Error> { - if !self.iter.valid() { - return Ok(None) - } - - let (key, value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return Ok(None), - }; - if !key.starts_with(self.keypathword.as_bytes()) { - // we passed the key path we are interested in. 
nothing left to do */ - return Ok(None) - } - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - let positions = try!(payload.get_wordinfos()) - .iter() - .map(|wi| wi.get_word_pos()as i64) - .collect(); - - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - let docresult = KeyBuilder::parse_doc_result_from_key(&key_str); - - Ok(Some((docresult, positions))) - } -} - -struct StemmedPhraseFilter { - filters: Vec, -} - -impl StemmedPhraseFilter { - fn new(filters: Vec) -> StemmedPhraseFilter { - StemmedPhraseFilter { - filters: filters, - } - } - - fn result(&mut self, - base: Option<(DocResult, Vec)>) -> Result, Error> { - // this is the number of matches left before all terms match and we can return a result - let mut matches_left = self.filters.len() - 1; - - if base.is_none() { return Ok(None); } - let (mut base_result, mut base_positions) = base.unwrap(); - - let mut current_filter = 0; - loop { - current_filter += 1; - if current_filter == self.filters.len() { - current_filter = 0; - } - - let next = try!(self.filters[current_filter].first_result(&base_result)); - - if next.is_none() { return Ok(None); } - let (next_result, next_positions) = next.unwrap(); - - if base_result == next_result { - let mut new_positions = Vec::new(); - for &pos in next_positions.iter() { - if let Ok(_) = base_positions.binary_search(&(pos-1)) { - new_positions.push(pos); - } - } - if new_positions.len() > 0 { - // we have valus that survive! reassign back to base_positions - base_positions = new_positions; - matches_left -= 1; - - if matches_left == 0 { - return Ok(Some(base_result)); - } - } else { - // we didn't match on phrase, so get next_result from first filter - current_filter = 0; - let next = try!(self.filters[current_filter].next_result()); - if next.is_none() { return Ok(None); } - let (next_result, next_positions) = next.unwrap(); - base_result = next_result; - base_positions = next_positions; - - matches_left = self.filters.len() - 1; - } - } else { - // we didn't match on next_result, so get first_result at next_result on - // 1st filter. - current_filter = 0; - let next = try!(self.filters[current_filter].first_result(&next_result)); - if next.is_none() { return Ok(None); } - let (next_result, next_positions) = next.unwrap(); - base_result = next_result; - base_positions = next_positions; - - matches_left = self.filters.len() - 1; - } - } - } -} - - -impl QueryRuntimeFilter for StemmedPhraseFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - let base_result = try!(self.filters[0].first_result(start)); - self.result(base_result) - } - - fn next_result(&mut self) -> Result, Error> { - let base_result = try!(self.filters[0].next_result()); - self.result(base_result) - } -} - -struct DistanceFilter { - filters: Vec, - current_filter: usize, - distance: i64, -} - -impl DistanceFilter { - fn new(filters: Vec, distance: i64) -> DistanceFilter { - DistanceFilter { - filters: filters, - current_filter: 0, - distance: distance, - } - } - - fn result(&mut self, - base: Option<(DocResult, Vec)>) -> Result, Error> { - // yes this code complex. I tried to break it up, but it wants to be like this. 
- - // this is the number of matches left before all terms match and we can return a result - let mut matches_left = self.filters.len() - 1; - - if base.is_none() { return Ok(None); } - let (mut base_result, positions) = base.unwrap(); - - // This contains tuples of word positions and the filter they came from, - // sorted by word position. - let mut base_positions: Vec<(i64, usize)> = positions.iter() - .map(|pos|(*pos, self.current_filter)) - .collect(); - - // distance is number of words between searched words. - // add one to make calculating difference easier since abs(posa - posb) == distance + 1 - let dis = self.distance + 1; - loop { - self.current_filter += 1; - if self.current_filter == self.filters.len() { - self.current_filter = 0; - } - - let next = try!(self.filters[self.current_filter].first_result(&base_result)); - - if next.is_none() { return Ok(None); } - let (next_result, next_positions) = next.unwrap(); - - if base_result == next_result { - // so we are in the same field. Now to check the proximity of the values from the - // next result to previous results. - - // new_positions_map will accept positions within range of pos. But only if all - // positions that can be are within range. We use the sorted map so we can add - // the same positions multiple times and it's a noop. - let mut new_positions_map = BTreeMap::new(); - for &pos in next_positions.iter() { - // could these lines be any longer? No they could not. - let start = match base_positions.binary_search_by_key(&(pos-dis), - |&(pos2,_)| pos2) { - Ok(start) => start, - Err(start) => start, - }; - - let end = match base_positions.binary_search_by_key(&(pos+dis), - |&(pos2,_)| pos2) { - Ok(end) => end, - Err(end) => end, - }; - - // we now collect all the filters within the range - let mut filters_encountered = HashSet::new(); - for &(_, filter_n) in base_positions[start..end].iter() { - filters_encountered.insert(filter_n); - } - - if filters_encountered.len() == self.filters.len() - matches_left { - // we encountered all the filters we can at this stage, - // so we should add them all to the new_positions_map - for &(prev_pos, filter_n) in base_positions[start..end].iter() { - new_positions_map.insert(prev_pos, filter_n); - } - // and add the current pos - new_positions_map.insert(pos, self.current_filter); - } - } - if new_positions_map.len() > 0 { - // we have values that survive! 
reassign back to positions - base_positions = new_positions_map.into_iter().collect(); - matches_left -= 1; - - if matches_left == 0 { - return Ok(Some(base_result)); - } else { - continue; - } - } - } - // we didn't match on next_result, so get next_result on current filter - let next = try!(self.filters[self.current_filter].next_result()); - - if next.is_none() { return Ok(None); } - let (next_result, next_positions) = next.unwrap(); - base_result = next_result; - base_positions = next_positions.iter() - .map(|pos| (*pos, self.current_filter)) - .collect(); - - matches_left = self.filters.len() - 1; - } - } -} - -impl QueryRuntimeFilter for DistanceFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].first_result(start)); - self.result(base_result) - } - - fn next_result(&mut self) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].next_result()); - self.result(base_result) - } -} - - -struct AndFilter<'a> { - filters: Vec>, - current_filter: usize, - array_depth: usize, -} - -impl<'a> AndFilter<'a> { - fn new(filters: Vec>, array_depth: usize) -> AndFilter<'a> { - AndFilter { - filters: filters, - current_filter: 0, - array_depth: array_depth, - } - } - - fn result(&mut self, base: Option) -> Result, Error> { - let mut matches_count = self.filters.len() - 1; - - if base.is_none() { return Ok(None); } - let mut base_result = base.unwrap(); - - base_result.arraypath.resize(self.array_depth, 0); - - loop { - self.current_filter += 1; - if self.current_filter == self.filters.len() { - self.current_filter = 0; - } - - let next = try!(self.filters[self.current_filter].first_result(&base_result)); - - if next.is_none() { return Ok(None); } - let mut next_result = next.unwrap(); - - next_result.arraypath.resize(self.array_depth, 0); - - if base_result == next_result { - matches_count -= 1; - if matches_count == 0 { - return Ok(Some(base_result)); - } - } else { - base_result = next_result; - matches_count = self.filters.len() - 1; - } - } - } -} - -impl<'a> QueryRuntimeFilter for AndFilter<'a> { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].first_result(start)); - self.result(base_result) - } - - fn next_result(&mut self) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].next_result()); - self.result(base_result) - } -} - -/// Used by OrFilter to maintain an already fetched result so we don't refetch when one side isn't -/// returned to caller. Because we won't know which side gets returned until both sides are -/// fetched. 
-struct FilterWithResult<'a> { - filter: Box, - result: Option, - is_done: bool, - array_depth: usize, -} - -impl<'a> FilterWithResult<'a> { - fn prime_first_result(&mut self, start: &DocResult) -> Result<(), Error> { - if self.is_done { - return Ok(()) - } - if self.result.is_none() { - self.result = try!(self.filter.first_result(start)); - } else if self.result.as_ref().unwrap() < start { - self.result = try!(self.filter.first_result(start)); - } - if self.result.is_none() { - self.is_done = true; - } else { - self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); - } - Ok(()) - } - - fn prime_next_result(&mut self) -> Result<(), Error> { - if self.is_done { - return Ok(()) - } - if self.result.is_none() { - self.result = try!(self.filter.next_result()); - } - if self.result.is_none() { - self.is_done = true; - } else { - self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); - } - Ok(()) - } -} - -struct OrFilter<'a> { - left: FilterWithResult<'a>, - right: FilterWithResult<'a>, -} - -impl<'a> OrFilter<'a> { - fn new(left: Box, - right: Box, - array_depth: usize) -> OrFilter<'a> { - OrFilter { - left: FilterWithResult{filter: left, - result: None, - array_depth: array_depth, - is_done: false, - }, - - right: FilterWithResult{filter: right, - result: None, - array_depth: array_depth, - is_done: false, - } - } - } - fn take_smallest(&mut self) -> Option { - if let Some(left) = self.left.result.take() { - // left exists - if let Some(right) = self.right.result.take() { - // both exist, return smallest - match left.cmp(&right) { - Ordering::Less => { - // left is smallest, return and put back right - self.right.result = Some(right); - Some(left) - }, - Ordering::Greater => { - // right is smallest, return and put back left - self.left.result = Some(left); - Some(right) - }, - Ordering::Equal => { - // return one and discard the other so we don't return - // identical result in a subsequent call - Some(left) - }, - } - } else { - // right doesn't exist. return left - Some(left) - } - } else { - // left doesn't exist - if self.right.result.is_some() { - // right exists. return it - self.right.result.take() - } else { - // neither exists. 
return none - None - } - } - } -} - -impl<'a> QueryRuntimeFilter for OrFilter<'a> { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - try!(self.left.prime_first_result(start)); - try!(self.right.prime_first_result(start)); - Ok(self.take_smallest()) - } - - fn next_result(&mut self) -> Result, Error> { - try!(self.left.prime_next_result()); - try!(self.right.prime_next_result()); - Ok(self.take_smallest()) - } -} - - struct Parser<'a> { @@ -1160,13 +585,6 @@ mod tests { query_results = Query::get_matches(r#"find {A:[{B: == "B2", C: == "C8"}]}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), None); - let (mut x, mut y) = (1, 2); - x = x + 1; - y = y + 1; - let (x, v) = (x+1, y+1); - - assert_eq!(x, 3); - query_results = Query::get_matches(r#"find {A:[{B: == "b1", C: == "C2"}]}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); From 776c1ed06654ee86cfe99fa5f83359ad3c3bbb07 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 23 Dec 2016 22:52:28 -0800 Subject: [PATCH 046/122] Add return clause to extract data and format from documents --- src/error.rs | 16 ++ src/index.rs | 9 +- src/json_shred.rs | 15 +- src/key_builder.rs | 84 ++++++- src/query.rs | 601 ++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 677 insertions(+), 48 deletions(-) diff --git a/src/error.rs b/src/error.rs index 02783a2..55f286c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,6 +3,8 @@ extern crate rocksdb; use std::{error, fmt}; use std::num::ParseIntError; +use std::io; + #[derive(Debug)] pub enum Error { @@ -10,6 +12,8 @@ pub enum Error { Shred(String), Capnp(capnp::Error), Rocks(rocksdb::Error), + Write(String), + Io(io::Error), } impl error::Error for Error { @@ -22,6 +26,8 @@ impl error::Error for Error { // that it has the std::error:Error implemented and hence // and err.description() Error::Rocks(_) => "This is an rocksdb error", + Error::Write(ref description) => description, + Error::Io(ref err) => err.description(), } } @@ -33,6 +39,8 @@ impl error::Error for Error { // NOTE vmx 2016-11-07: Looks like the RocksDB Wrapper needs to be // patched to be based on the std::error::Error trait Error::Rocks(_) => None, + Error::Write(_) => None, + Error::Io(ref err) => Some(err as &error::Error), } } } @@ -55,6 +63,12 @@ impl From for Error { } } +impl From for Error { + fn from(err: io::Error) -> Error { + Error::Io(err) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { @@ -62,6 +76,8 @@ impl fmt::Display for Error { Error::Shred(ref err) => write!(f, "Shred error: {}", err), Error::Capnp(ref err) => write!(f, "Capnproto error: {}", err), Error::Rocks(ref err) => write!(f, "RocksDB error: {}", err), + Error::Write(ref err) => write!(f, "Write error: {}", err), + Error::Io(ref err) => write!(f, "Io error: {}", err), } } } diff --git a/src/index.rs b/src/index.rs index dbb7046..db7fbf2 100644 --- a/src/index.rs +++ b/src/index.rs @@ -113,8 +113,12 @@ impl Index { { let docid = try!(shredder.shred(json, self.high_doc_seq + 1, self.batch.as_mut().unwrap())); + if self.id_str_to_id_seq.contains_key(&docid) { + return Err(Error::Write("Attempt to insert multiple docs with same _id" + .to_string())); + } self.high_doc_seq += 1; - self.id_str_to_id_seq.insert(format!("I{}", docid), format!("S{}", self.high_doc_seq)); + self.id_str_to_id_seq.insert(format!("I{}", docid), 
format!("{}", self.high_doc_seq)); } Ok(()) } @@ -137,10 +141,9 @@ impl Index { } } - // Add the ids_to_seq keyspace entries + // Add the ids_to_seq keyspace entry for (id, seq) in &self.id_str_to_id_seq { try!(self.batch.as_mut().unwrap().put(id.as_bytes(), seq.as_bytes())); - try!(self.batch.as_mut().unwrap().put(seq.as_bytes(), id.as_bytes())); } let mut header = Header::new(); diff --git a/src/json_shred.rs b/src/json_shred.rs index dee9eac..2a51745 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -97,7 +97,12 @@ impl Shredder { } let key = self.kb.value_key(docseq); - try!(batch.put(&key.into_bytes(), &text.as_bytes())); + + let mut buffer = String::with_capacity(text.len() + 1); + buffer.push('s'); + buffer.push_str(&text); + + try!(batch.put(&key.into_bytes(), &buffer.as_bytes())); Ok(()) } @@ -192,7 +197,13 @@ impl Shredder { // No children to ignore if self.ignore_children == 0 { match self.extract_key(parser.stack().top()) { - ObjectKeyTypes::Id => self.doc_id = value, + ObjectKeyTypes::Id => { + self.doc_id = value.clone(); + self.kb.pop_object_key(); + self.kb.push_object_key("_id"); + + try!(self.add_entries(&value, docseq, batch)); + }, ObjectKeyTypes::Key(key) => { // Pop the dummy object that makes ObjectEnd happy // or the previous object key diff --git a/src/key_builder.rs b/src/key_builder.rs index b8bde89..d5c5137 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -1,6 +1,10 @@ use query::DocResult; use std::str; +pub enum Segment { + ObjectKey(String), + Array(u64), +} #[derive(Debug, Clone)] pub struct KeyBuilder { @@ -56,13 +60,16 @@ impl KeyBuilder { pub fn value_key(&self, seq: u64) -> String { let mut string = String::with_capacity(100); string.push('V'); + string.push_str(&seq.to_string()); + string.push('#'); + let mut i = 0; for segment in &self.keypath { string.push_str(&segment); + if segment == "$" { + string.push_str(&self.arraypath[i].to_string()); + i += 1; + } } - string.push('#'); - string.push_str(&seq.to_string()); - - KeyBuilder::add_arraypath(&mut string, &self.arraypath); string } @@ -75,15 +82,64 @@ impl KeyBuilder { string.push_str(i.to_string().as_str()); } } - } + // returns the unescaped segment as Segment and the escaped segment as a slice + pub fn parse_first_key_value_segment(keypath: &str) -> Option<(Segment, String)> { + + let mut unescaped = String::with_capacity(50); + let mut len_bytes = 1; + let mut chars = keypath.chars(); + + // first char must be a . or a $ or we've exceeded the keypath + match chars.next() { + Some('.') => { + loop { + match chars.next() { + Some('\\') => { + if let Some(c) = chars.next() { + len_bytes += c.len_utf8(); + unescaped.push(c); + } else { + panic!("Escape char found as last char in keypath"); + } + }, + Some('.') | Some('$') => { + break; + }, + Some(c) => { + len_bytes += c.len_utf8(); + unescaped.push(c); + } + None => { + break; + } + } + } + Some((Segment::ObjectKey(unescaped), keypath[..len_bytes].to_string())) + }, + Some('$') => { + let mut i = String::new(); + for c in chars { + if c >= '0' && c <= '9' { + i.push(c); + } else { + break; + } + } + Some((Segment::Array(i.parse().unwrap()), keypath[..1+i.len()].to_string())) + }, + Some(_) => None, // we must be past the keypath portion of string. done. 
+ None => None, + } + } + pub fn push_object_key(&mut self, key: &str) { let mut escaped_key = String::with_capacity((key.len() * 2) + 1); // max expansion escaped_key.push('.'); for cc in key.chars() { // Escape chars that conflict with delimiters - if "\\$.!#,".contains(cc) { + if "\\$.!#".contains(cc) { escaped_key.push('\\'); } escaped_key.push(cc); @@ -96,10 +152,24 @@ impl KeyBuilder { self.arraypath.push(0); } + pub fn push_array_index(&mut self, index: u64) { + self.keypath.push("$".to_string()); + self.arraypath.push(index); + } + pub fn pop_object_key(&mut self) { debug_assert!(self.keypath.last().unwrap().starts_with(".")); self.keypath.pop(); } + pub fn peek_object_key(&self) -> String { + debug_assert!(self.keypath.last().unwrap().starts_with(".")); + let x = KeyBuilder::parse_first_key_value_segment(&self.keypath.last().unwrap()); + if let Some((Segment::ObjectKey(key), _unescaped)) = x { + key + } else { + panic!("peek_object_key is messed up yo!"); + } + } pub fn pop_array(&mut self) { debug_assert!(self.keypath.last().unwrap() == "$"); @@ -141,7 +211,7 @@ impl KeyBuilder { pub fn parse_doc_result_from_key(str: &str) -> DocResult { let mut dr = DocResult::new(); let (_path_str, seq_str, arraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&str); - dr.seq = seq_str.parse().unwrap(); + dr.seq = seq_str.parse().unwrap(); if !arraypath_str.is_empty() { for numstr in arraypath_str.split(",") { dr.arraypath.push(numstr.parse().unwrap()); diff --git a/src/query.rs b/src/query.rs index c562554..236cfd8 100644 --- a/src/query.rs +++ b/src/query.rs @@ -3,17 +3,23 @@ extern crate capnp; use std::str; use std::cmp::Ordering; +use std::io::Write; +use std::collections::HashMap; +use std::iter::Peekable; +use std::mem::transmute; +use std::collections::VecDeque; +use std::iter::Iterator; use error::Error; use index::Index; -use key_builder::KeyBuilder; +use key_builder::{KeyBuilder, Segment}; use stems::Stems; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{IteratorMode, Snapshot}; +use rocksdb::{self, DBIterator, IteratorMode, Snapshot}; #[derive(PartialEq, Eq, PartialOrd, Clone)] @@ -48,14 +54,23 @@ pub struct QueryResults<'a> { filter: Box, doc_result_next: DocResult, snapshot: Snapshot<'a>, + iter: DBIterator, + returnable: Box, + buffer: Vec, } impl<'a> QueryResults<'a> { - fn new(filter: Box, snapshot: Snapshot<'a>) -> QueryResults<'a> { + fn new(filter: Box, + snapshot: Snapshot<'a>, + returnable: Box) -> QueryResults<'a> { + let iter = snapshot.iterator(IteratorMode::Start); QueryResults{ filter: filter, doc_result_next: DocResult::new(), snapshot: snapshot, + iter: iter, + returnable: returnable, + buffer: Vec::new(), } } @@ -74,9 +89,9 @@ impl<'a> QueryResults<'a> { let seq = try!(self.get_next()); match seq { Some(seq) => { - let key = format!("S{}", seq); + let key = format!("V{}#._id", seq); match try!(self.snapshot.get(&key.as_bytes())) { - // If there is an id, it's UTF-8. Strip off keyspace leading byte + // If there is an id, it's UTF-8. 
Strip off type leading byte Some(id) => Ok(Some(id.to_utf8().unwrap()[1..].to_string())), None => Ok(None) } @@ -84,10 +99,19 @@ impl<'a> QueryResults<'a> { None => Ok(None), } } -} - - + pub fn next_result(&mut self) -> Result, Error> { + let seq = match try!(self.get_next()) { + Some(seq) => seq, + None => return Ok(None), + }; + let bind = HashMap::new(); + let mut results = VecDeque::new(); + try!(self.returnable.fetch_result(&mut self.iter, seq, &bind, &mut results)); + try!(self.returnable.write_result(&mut results, &mut self.buffer)); + Ok(Some(unsafe{str::from_utf8_unchecked(&self.buffer[..])}.to_string())) + } +} struct Parser<'a> { query: String, @@ -132,30 +156,40 @@ impl<'a> Parser<'a> { self.ws(); Ok(()) } else { - Err(Error::Parse(format!("Expected '{}' at character {}.", - token, self.offset))) + Err(Error::Parse(format!("Expected '{}' at character {}, found {}.", + token, self.offset, + &self.query[self.offset..self.offset+1]))) } } - fn could_consume(&mut self, token: &str) -> bool { + fn could_consume(&self, token: &str) -> bool { self.query[self.offset..].starts_with(token) } - fn consume_field(&mut self) -> Option { + fn consume_field(&mut self) -> Result, Error> { let mut result = String::new(); - for char in self.query[self.offset..].chars() { - if char.is_alphanumeric() { - result.push(char); - } else { - break; + { + let mut chars = self.query[self.offset..].chars(); + if let Some(c) = chars.next() { + // first char cannot be numeric + if c.is_alphabetic() || '_' == c || '$' == c { + result.push(c); + for c in chars { + if c.is_alphanumeric() || '_' == c || '$' == c { + result.push(c); + } else { + break; + } + } + } } - } + } if result.len() > 0 { self.offset += result.len(); self.ws(); - Some(result) + Ok(Some(result)) } else { - None + self.consume_string_literal() } } @@ -177,7 +211,59 @@ impl<'a> Parser<'a> { } } - fn consume_string_literal(&mut self) -> Result { + fn consume_keypath(&mut self) -> Result, Error> { + let key: String = if self.consume(".") { + if self.consume("[") { + let key = try!(self.must_consume_string_literal()); + try!(self.must_consume("]")); + key + } else { + if let Some(key) = try!(self.consume_field()) { + key + } else { + self.ws(); + // this means return the whole document + return Ok(Some(KeyBuilder::new())); + } + } + } else { + return Ok(None); + }; + + let mut kb = KeyBuilder::new(); + kb.push_object_key(&key); + loop { + if self.consume("[") { + if let Some(index) = try!(self.consume_integer()) { + kb.push_array_index(index as u64); + } else { + return Err(Error::Parse("Expected array index integer.".to_string())); + } + try!(self.must_consume("]")); + } else if self.consume(".") { + if let Some(key) = try!(self.consume_field()) { + kb.push_object_key(&key); + } else { + return Err(Error::Parse("Expected object key.".to_string())); + } + } else { + break; + } + } + self.ws(); + Ok(Some(kb)) + } + + + fn must_consume_string_literal(&mut self) -> Result { + if let Some(string) = try!(self.consume_string_literal()) { + Ok(string) + } else { + Err(Error::Parse("Expected string literal.".to_string())) + } + } + + fn consume_string_literal(&mut self) -> Result, Error> { let mut lit = String::new(); let mut next_is_special_char = false; if self.could_consume("\"") { @@ -212,11 +298,13 @@ impl<'a> Parser<'a> { } } try!(self.must_consume("\"")); - Ok(lit) + self.ws(); + Ok(Some(lit)) } else { - Err(Error::Parse("Expected \"".to_string())) + Ok(None) } } + /* find @@ -332,7 +420,7 @@ ws1 fn ocompare<'b>(&'b mut self) -> Result, Error> { 
if let Some(filter) = try!(self.oparens()) { Ok(filter) - } else if let Some(field) = self.consume_field() { + } else if let Some(field) = try!(self.consume_field()) { self.kb.push_object_key(&field); try!(self.must_consume(":")); if let Some(filter) = try!(self.oparens()) { @@ -364,7 +452,7 @@ ws1 fn compare<'b>(&'b mut self) -> Result, Error> { if self.consume("==") { - let literal = try!(self.consume_string_literal()); + let literal = try!(self.must_consume_string_literal()); let stems = Stems::new(&literal); let mut filters: Vec> = Vec::new(); for stem in stems { @@ -380,7 +468,7 @@ ws1 } } else if self.consume("~=") { // regular search - let literal = try!(self.consume_string_literal()); + let literal = try!(self.must_consume_string_literal()); let stems = Stems::new(&literal); let stemmed_words: Vec = stems.map(|stem| stem.stemmed).collect(); @@ -409,7 +497,7 @@ ws1 }; try!(self.must_consume("=")); - let literal = try!(self.consume_string_literal()); + let literal = try!(self.must_consume_string_literal()); let stems = Stems::new(&literal); let mut filters: Vec = Vec::new(); for stem in stems { @@ -476,21 +564,441 @@ ws1 Ok(filter) } + fn return_clause(&mut self) -> Result, Error> { + if self.consume("return") { + if let Some(ret_value) = try!(self.ret_value()) { + Ok(ret_value) + } else { + Err(Error::Parse("Expected key, object or array to return.".to_string())) + } + } else { + let mut kb = KeyBuilder::new(); + kb.push_object_key("_id"); + Ok(Box::new(RetValue{kb:kb})) + } + } - fn build_filter(mut self) -> Result<(Box, Snapshot<'a>), Error> { + fn ret_object(&mut self) -> Result, Error> { + try!(self.must_consume("{")); + let mut fields: Vec<(String, Box)> = Vec::new(); + loop { + if let Some(field) = try!(self.consume_field()) { + try!(self.must_consume(":")); + if let Some(ret_value) = try!(self.ret_value()) { + fields.push((field, ret_value)); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Expected key to return.".to_string())); + } + } else { + break; + } + } + + try!(self.must_consume("}")); + if fields.is_empty() { + return Err(Error::Parse("Found empty object in return.".to_string())); + } + Ok(Box::new(RetObject{fields: fields})) + } + + fn ret_array(&mut self) -> Result, Error> { + try!(self.must_consume("[")); + let mut slots = Vec::new(); + loop { + if let Some(ret_value) = try!(self.ret_value()) { + slots.push(ret_value); + if !self.consume(",") { + break; + } + } else { + break; + } + } + try!(self.must_consume("]")); + if slots.is_empty() { + return Err(Error::Parse("Found empty array in return.".to_string())); + } + Ok(Box::new(RetArray{slots: slots})) + + } + + fn ret_value(&mut self) -> Result>, Error> { + if let Some(kb) = try!(self.consume_keypath()) { + Ok(Some(Box::new(RetValue{kb: kb}))) + } else if self.could_consume("{") { + Ok(Some(try!(self.ret_object()))) + } else if self.could_consume("[") { + Ok(Some(try!(self.ret_array()))) + } else { + Ok(None) + } + } + + fn build_filter(&mut self) -> Result, Error> { self.ws(); - Ok((try!(self.find()), self.snapshot)) + Ok(try!(self.find())) } } + +pub trait Returnable { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + bind_var_keys: &HashMap, + result: &mut VecDeque>) -> Result<(), Error>; + + fn write_result(&self, results: &mut VecDeque>, + write: &mut Write) -> Result<(), Error>; +} + +pub enum JsonValue { + Number(f64), + String(String), + Array(Vec), + Object(Vec<(String, JsonValue)>), + True, + False, + Null, +} + +impl JsonValue { + fn str_to_literal(string: &str) 
->String { + let mut ret = String::with_capacity(string.len()*2+2); + ret.push('"'); + for c in string.chars() { + if c == '"' || c == '\\' { + ret.push('\\'); + } + ret.push(c); + } + ret.push('"'); + ret + } + + fn render(&self, write: &mut Write) -> Result<(), Error> { + match self { + &JsonValue::Number(ref num) => try!(write.write_all(num.to_string().as_bytes())), + &JsonValue::String(ref string) => { + try!(write.write_all(JsonValue::str_to_literal(&string).as_bytes())) + }, + &JsonValue::Array(ref array) => { + try!(write.write_all("[".as_bytes())); + + let mut iter = array.iter().peekable(); + loop { + match iter.next() { + Some(json) => try!(json.render(write)), + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + } + try!(write.write_all("]".as_bytes())); + }, + &JsonValue::Object(ref object) => { + try!(write.write_all("{".as_bytes())); + + let mut iter = object.iter().peekable(); + loop { + match iter.next() { + Some(&(ref key, ref json)) => { + try!(write.write_all(JsonValue::str_to_literal(&key).as_bytes())); + try!(write.write_all(":".as_bytes())); + try!(json.render(write)); + } + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + } + try!(write.write_all("}".as_bytes())); + }, + &JsonValue::True => try!(write.write_all("true".as_bytes())), + &JsonValue::False => try!(write.write_all("false".as_bytes())), + &JsonValue::Null => try!(write.write_all("null".as_bytes())), + } + Ok(()) + } +} + +pub struct RetObject { + fields: Vec<(String, Box)>, +} + +impl Returnable for RetObject { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + bind_var_keys: &HashMap, + result: &mut VecDeque>) -> Result<(), Error> { + for &(ref _key, ref field) in self.fields.iter() { + try!(field.fetch_result(iter, seq, bind_var_keys, result)); + } + Ok(()) + } + + fn write_result(&self, results: &mut VecDeque>, + write: &mut Write) -> Result<(), Error> { + try!(write.write_all("{".as_bytes())); + let mut iter = self.fields.iter().peekable(); + loop { + match iter.next() { + Some(&(ref key, ref returnable)) => { + try!(write.write_all(JsonValue::str_to_literal(key).as_bytes())); + try!(write.write_all(":".as_bytes())); + try!(returnable.write_result(results, write)); + }, + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + } + try!(write.write_all("}".as_bytes())); + Ok(()) + } +} + + +pub struct RetArray { + slots: Vec>, +} + +impl Returnable for RetArray { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + bind_var_keys: &HashMap, + result: &mut VecDeque>) -> Result<(), Error> { + for ref mut slot in self.slots.iter() { + try!(slot.fetch_result(iter, seq, bind_var_keys, result)); + } + Ok(()) + } + + fn write_result(&self, results: &mut VecDeque>, + write: &mut Write) -> Result<(), Error> { + + try!(write.write_all("[".as_bytes())); + let mut iter = self.slots.iter().peekable(); + loop { + match iter.next() { + Some(ref returnable) => try!(returnable.write_result(results, write)), + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + } + try!(write.write_all("]".as_bytes())); + Ok(()) + } +} + +pub struct RetValue { + kb: KeyBuilder, +} + +impl RetValue { + fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { + match bytes[0] as char { + 's' => { + let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); + JsonValue::String(string) + }, + 'n' => { + assert!(bytes.len() == 9); + let mut bytes2: 
[u8; 8] = [0; 8]; + for (n, b) in bytes[1..9].iter().enumerate() { + bytes2[n] = *b; + } + let double: f64 = unsafe{transmute(bytes2)}; + JsonValue::Number(double) + }, + 'T' => JsonValue::True, + 'F' => JsonValue::False, + 'N' => JsonValue::Null, + 'o' => JsonValue::Object(vec![]), + 'a' => JsonValue::Array(vec![]), + what => panic!("unexpected type tag in value: {}", what), + } + } + + fn return_array(mut array: Vec<(u64, JsonValue)>) -> Result { + array.sort_by_key(|tuple| tuple.0); + Ok(JsonValue::Array(array.into_iter() + .map(|(_i, json)| json) + .collect())) + } + + fn fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, + mut key: Box<[u8]>, mut value: Box<[u8]>) -> Result { + + if key.len() == value_key.len() { + // we have a key match! + return Ok(RetValue::bytes_to_json_value(value.as_ref())); + } + let segment = { + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + let remaining = &key_str[value_key.len()..]; + KeyBuilder::parse_first_key_value_segment(&remaining) + }; + + match segment { + Some((Segment::ObjectKey(mut unescaped), escaped)) => { + let mut object: Vec<(String, JsonValue)> = Vec::new(); + + let mut value_key_next = value_key.to_string() + &escaped; + loop { + let json_val = try!(RetValue::fetch(iter, &value_key_next, key, value)); + object.push((unescaped, json_val)); + + let segment = match iter.peek() { + Some(&(ref k, ref _v)) => { + if !k.starts_with(value_key.as_bytes()) { + return Ok(JsonValue::Object(object)); + } + + let key_str = unsafe{str::from_utf8_unchecked(&k)}; + let remaining = &key_str[value_key.len()..]; + + KeyBuilder::parse_first_key_value_segment(&remaining) + }, + None => return Ok(JsonValue::Object(object)), + }; + + if let Some((Segment::ObjectKey(unescaped2), escaped2)) = segment { + unescaped = unescaped2; + // advance the peeked iter + match iter.next() { + Some((k, v)) => { + key = k; + value = v; + } + None => panic!("couldn't advance already peeked iter"), + }; + value_key_next.truncate(value_key.len()); + value_key_next.push_str(&escaped2); + } else { + return Ok(JsonValue::Object(object)); + } + } + } + Some((Segment::Array(mut i), escaped)) => { + // we use a tuple with ordinal because we encounter + // elements in lexical sorting order instead of ordinal order + let mut array: Vec<(u64, JsonValue)> = Vec::new(); + + let mut value_key_next = value_key.to_string() + &escaped; + loop { + let json_val = try!(RetValue::fetch(iter, &value_key_next, + key, value)); + array.push((i, json_val)); + + let segment = match iter.peek() { + Some(&(ref k, ref _v)) => { + if !k.starts_with(value_key.as_bytes()) { + return RetValue::return_array(array); + } + + let key_str = unsafe{str::from_utf8_unchecked(&k)}; + let remaining = &key_str[value_key.len()..]; + + KeyBuilder::parse_first_key_value_segment(&remaining) + }, + None => return RetValue::return_array(array), + }; + + if let Some((Segment::Array(i2), escaped2)) = segment { + i = i2; + // advance the already peeked iter + match iter.next() { + Some((k, v)) => { + key = k; + value = v; + }, + None => panic!("couldn't advance already peeked iter"), + }; + value_key_next.truncate(value_key.len()); + value_key_next.push_str(&escaped2); + } else { + return RetValue::return_array(array); + } + } + }, + None => { + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + panic!("somehow couldn't parse key segment {} {}", value_key, key_str); + }, + } + } +} + +impl Returnable for RetValue { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + bind_var_keys: &HashMap, + 
result: &mut VecDeque>) -> Result<(), Error> { + let value_key = if self.kb.keypath_segments_len() == 1 { + let key = self.kb.peek_object_key(); + if let Some(value_key) = bind_var_keys.get(&key) { + value_key.to_string() + } else { + self.kb.value_key(seq) + } + } else { + self.kb.value_key(seq) + }; + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => { + result.push_back(None); + return Ok(()) + }, + }; + + if !key.starts_with(value_key.as_bytes()) { + result.push_back(None); + return Ok(()); + } + + let json_value = try!(RetValue::fetch(&mut iter.peekable(), &value_key, + key, value)); + result.push_back(Some(json_value)); + Ok(()) + } + + fn write_result(&self, results: &mut VecDeque>, + write: &mut Write) -> Result<(), Error> { + if let Some(option) = results.pop_front() { + if let Some(json) = option { + try!(json.render(write)); + } else { + // for now just output a Null when we found nothing + try!(JsonValue::Null.render(write)); + } + } else { + panic!("missing result!"); + } + Ok(()) + } +} + + + impl Query { pub fn get_matches<'a>(query: String, index: &'a Index) -> Result, Error> { match index.rocks { Some(ref rocks) => { let snapshot = Snapshot::new(&rocks); - let parser = Parser::new(query, snapshot); - let (filter, snapshot2) = try!(parser.build_filter()); - Ok(QueryResults::new(filter, snapshot2)) + let mut parser = Parser::new(query, snapshot); + let filter = try!(parser.build_filter()); + let returnable = try!(parser.return_clause()); + Ok(QueryResults::new(filter, parser.snapshot, returnable)) }, None => { Err(Error::Parse("You must open the index first".to_string())) @@ -527,15 +1035,15 @@ mod tests { } #[test] - fn test_consume_string_literal() { + fn test_must_consume_string_literal() { let mut index = Index::new(); - index.open("target/tests/test_consume_string_literal", Some(OpenOptions::Create)).unwrap(); + index.open("target/tests/test_must_consume_string_literal", Some(OpenOptions::Create)).unwrap(); let rocks = &index.rocks.unwrap(); let snapshot = Snapshot::new(rocks); let query = r#"" \n \t test""#.to_string(); let mut parser = Parser::new(query, snapshot); - assert_eq!(parser.consume_string_literal().unwrap(), " \n \t test".to_string()); + assert_eq!(parser.must_consume_string_literal().unwrap(), " \n \t test".to_string()); } #[test] @@ -571,6 +1079,7 @@ mod tests { let _ = index.add(r#"{"_id":"9", "A":["A1", "A2"]}"#); let _ = index.add(r#"{"_id":"10", "A":"a bunch of words in this sentence"}"#); let _ = index.add(r#"{"_id":"11", "A":""}"#); + let _ = index.add(r#"{"_id":"12", "A":["1","2","3","4","5","6","7","8","9","10","11","12"]}"#); index.flush().unwrap(); @@ -650,10 +1159,30 @@ mod tests { assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); - query_results = Query::get_matches(r#"find {A: == ""}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("11".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[ == "1"]} + return .A "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(), + Some(r#"["1","2","3","4","5","6","7","8","9","10","11","12"]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[ == "2"]} + return 
.A[0] "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[ == "2"]} + return [.A[0], ._id] "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["1","12"]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[ == "2"]} + return {foo:.A[0], bar: ._id} "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"1","bar":"12"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); } From 68e38f38f834c8b7a8a71aac2e4c99f5e5e2919e Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sat, 24 Dec 2016 00:47:08 -0800 Subject: [PATCH 047/122] Losslessly store all json types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We still can’t query on anything but strings, but we can store and return all datatypes. --- src/json_shred.rs | 92 ++++++++++++++++++++++++++++++++++++++++++++-- src/key_builder.rs | 6 +++ src/query.rs | 7 +++- 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 2a51745..1b96986 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -2,6 +2,9 @@ extern crate rocksdb; extern crate rustc_serialize; use std::collections::HashMap; +use std::mem::transmute; +use std::io::Write; +use std::str::Chars; use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; @@ -106,7 +109,47 @@ impl Shredder { Ok(()) } + + fn add_value(&mut self, code: char, value: &[u8], + docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + let key = self.kb.value_key(docseq); + + let mut buffer = Vec::with_capacity(value.len() + 1); + buffer.push(code as u8); + try!((&mut buffer as &mut Write).write_all(&value)); + + try!(batch.put(&key.into_bytes(), &buffer.as_ref())); + + Ok(()) + } + fn maybe_add_value(&mut self, parser: &Parser, code: char, value: &[u8], + docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + if self.ignore_children == 0 { + match self.extract_key(parser.stack().top()) { + ObjectKeyTypes::Id => { + return Err(Error::Shred( + "Expected string for `_id` field, got another type".to_string())); + }, + ObjectKeyTypes::Key(key) => { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.kb.pop_object_key(); + self.kb.push_object_key(&key); + try!(self.add_value(code, &value, docseq, batch)); + self.kb.inc_top_array_offset(); + }, + ObjectKeyTypes::NoKey => { + try!(self.add_value(code, &value, docseq, batch)); + self.kb.inc_top_array_offset(); + }, + ObjectKeyTypes::Ignore => { + self.ignore_children = 1; + }, + } + } + Ok(()) + } // Extract key if it exists and indicates if it's a special type of key fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { if self.kb.last_pushed_keypath_is_object_key() { @@ -156,6 +199,10 @@ impl Shredder { Result { let mut parser = Parser::new(json.chars()); let mut token = parser.next(); + + // this will keep track of objects where we encountered keys. + // if we didn't encounter keys then the topmost element will be false. + let mut object_keys_indexed = Vec::new(); loop { // Get the next token, so that in case of an `ObjectStart` the key is already - // on the stack. 
@@ -167,6 +214,7 @@ else { // Just push something to make `ObjectEnd` happy self.kb.push_object_key(""); + object_keys_indexed.push(false); } }, Some(JsonEvent::ObjectEnd) => { @@ -174,6 +222,11 @@ self.ignore_children -= 1; } else { self.kb.pop_object_key(); + if !object_keys_indexed.pop().unwrap() { + // this means we never wrote a key because the object was empty. + // So preserve the empty object by writing a special value. + try!(self.maybe_add_value(&parser, 'o', &[], docseq, batch)); + } self.kb.inc_top_array_offset(); } }, @@ -189,7 +242,14 @@ if self.ignore_children > 0 { self.ignore_children -= 1; } else { - self.kb.pop_array(); + if self.kb.peek_array_offset() == 0 { + // this means we never wrote a value because the array was empty. + // So preserve the empty array by writing a special value. + self.kb.pop_array(); + try!(self.maybe_add_value(&parser, 'a', &[], docseq, batch)); + } else { + self.kb.pop_array(); + } self.kb.inc_top_array_offset(); } }, @@ -201,6 +261,7 @@ self.doc_id = value.clone(); self.kb.pop_object_key(); self.kb.push_object_key("_id"); + *object_keys_indexed.last_mut().unwrap() = true; try!(self.add_entries(&value, docseq, batch)); }, @@ -209,6 +270,7 @@ // or the previous object key self.kb.pop_object_key(); self.kb.push_object_key(&key); + *object_keys_indexed.last_mut().unwrap() = true; try!(self.add_entries(&value, docseq, batch)); self.kb.inc_top_array_offset(); @@ -223,8 +285,32 @@ } } }, - not_implemented => { - panic!("Not yet implemented other JSON types! {:?}", not_implemented); + Some(JsonEvent::BooleanValue(tf)) => { + let code = if tf {'T'} else {'F'}; + try!(self.maybe_add_value(&parser, code, &[], docseq, batch)); + }, + Some(JsonEvent::I64Value(i)) => { + let f = i as f64; + let bytes = unsafe{ transmute::(f) }; + try!(self.maybe_add_value(&parser, 'f', &bytes[..], docseq, batch)); + }, + Some(JsonEvent::U64Value(u)) => { + let f = u as f64; + let bytes = unsafe{ transmute::(f) }; + try!(self.maybe_add_value(&parser, 'f', &bytes[..], docseq, batch)); + }, + Some(JsonEvent::F64Value(f)) => { + let bytes = unsafe{ transmute::(f) }; + try!(self.maybe_add_value(&parser, 'f', &bytes[..], docseq, batch)); + }, + Some(JsonEvent::NullValue) => { + try!(self.maybe_add_value(&parser, 'N', &[], docseq, batch)); + }, + Some(JsonEvent::Error(error)) => { + return Err(Error::Shred(error.to_string())); + }, + None => { + break; } }; diff --git a/src/key_builder.rs b/src/key_builder.rs index d5c5137..25847a4 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -161,6 +161,7 @@ impl KeyBuilder { debug_assert!(self.keypath.last().unwrap().starts_with(".")); self.keypath.pop(); } + pub fn peek_object_key(&self) -> String { debug_assert!(self.keypath.last().unwrap().starts_with(".")); let x = KeyBuilder::parse_first_key_value_segment(&self.keypath.last().unwrap()); @@ -171,6 +172,11 @@ impl KeyBuilder { } } + pub fn peek_array_offset(&self) -> u64 { + debug_assert!(self.keypath.last().unwrap().starts_with("$")); + self.arraypath.last().unwrap().clone() + } + pub fn pop_array(&mut self) { debug_assert!(self.keypath.last().unwrap() == "$"); self.arraypath.pop(); diff --git a/src/query.rs b/src/query.rs index 236cfd8..32b9c82 100644 --- a/src/query.rs +++ b/src/query.rs @@ -806,7 +806,7 @@ impl RetValue { let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); JsonValue::String(string) }, - 'n' => { + 'f' => { assert!(bytes.len() == 9); let mut 
bytes2: [u8; 8] = [0; 8]; for (n, b) in bytes[1..9].iter().enumerate() { @@ -1080,6 +1080,7 @@ mod tests { let _ = index.add(r#"{"_id":"10", "A":"a bunch of words in this sentence"}"#); let _ = index.add(r#"{"_id":"11", "A":""}"#); let _ = index.add(r#"{"_id":"12", "A":["1","2","3","4","5","6","7","8","9","10","11","12"]}"#); + let _ = index.add(r#"{"_id":"13", "A":["foo",1,true,false,null,{},[]]}"#); index.flush().unwrap(); @@ -1183,6 +1184,10 @@ mod tests { return {foo:.A[0], bar: ._id} "#.to_string(), &index).unwrap(); assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"1","bar":"12"}"#.to_string())); assert_eq!(query_results.next_result().unwrap(), None); + query_results = Query::get_matches(r#"find {A:[ == "foo"]} + return .A "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["foo",1,true,false,null,{},[]]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); } From 2b1827d9fe7a9fc9a327894ad0c344f84faf3fd5 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sat, 24 Dec 2016 12:31:48 -0800 Subject: [PATCH 048/122] Fix broken unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Didn’t notice they were broken because I was accidentally only running the query tests. Oops. --- src/json_shred.rs | 30 ++++++++++++++++-------------- src/key_builder.rs | 6 +++++- src/stems.rs | 18 +++++++++--------- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 1b96986..4ddf415 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -334,21 +334,23 @@ mod tests { fn wordinfos_from_rocks(rocks: rocksdb::DB) -> Vec<(String, Vec)> { let mut result = Vec::new(); for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - let mut wordinfos = Vec::new(); - for wi in payload.get_wordinfos().unwrap().iter() { - wordinfos.push(WordInfo{ - word_pos: wi.get_word_pos(), - suffix_text: wi.get_suffix_text().unwrap().to_string(), - suffix_offset: wi.get_suffix_offset(), - }); + if key[0] as char == 'W' { + let mut ref_value = &*value; + let message_reader = ::capnp::serialize_packed::read_message( + &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + let payload = message_reader.get_root::().unwrap(); + + let mut wordinfos = Vec::new(); + for wi in payload.get_wordinfos().unwrap().iter() { + wordinfos.push(WordInfo{ + word_pos: wi.get_word_pos(), + suffix_text: wi.get_suffix_text().unwrap().to_string(), + suffix_offset: wi.get_suffix_offset(), + }); + } + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + result.push((key_string, wordinfos)); } - let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); - result.push((key_string, wordinfos)); } result } diff --git a/src/key_builder.rs b/src/key_builder.rs index 25847a4..f017141 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -194,7 +194,11 @@ impl KeyBuilder { } pub fn last_pushed_keypath_is_object_key(&self) -> bool { - self.keypath.last().unwrap().starts_with(".") + if self.keypath.is_empty() { + false + } else { + self.keypath.last().unwrap().starts_with(".") + } } pub fn keypath_segments_len(&self) -> usize { diff --git a/src/stems.rs b/src/stems.rs index df84e07..94a867a 100644 --- a/src/stems.rs 
+++ b/src/stems.rs @@ -145,18 +145,18 @@ mod tests { let expected = vec![ StemmedWord { word_pos: 0, suffix_offset: 0, stemmed: String::from("these"), suffix: String::from("THEse ") }, - StemmedWord { word_pos: 6, suffix_offset: 6, + StemmedWord { word_pos: 1, suffix_offset: 0, stemmed: String::from("word"), suffix: String::from("Words ") }, // "deeply" stems to "deepli" - StemmedWord { word_pos: 12, suffix_offset: 17, + StemmedWord { word_pos: 2, suffix_offset: 5, stemmed: String::from("deepli"), suffix: String::from("Y ") }, - StemmedWord { word_pos: 19, suffix_offset: 23, + StemmedWord { word_pos: 3, suffix_offset: 4, stemmed: String::from("test"), suffix: String::from(" ") }, - StemmedWord { word_pos: 24, suffix_offset: 26, + StemmedWord { word_pos: 4, suffix_offset: 2, stemmed: String::from("smooth"), suffix: String::from("OOthly ") }, - StemmedWord { word_pos: 33, suffix_offset: 37, + StemmedWord { word_pos: 5, suffix_offset: 4, stemmed: String::from("that"), suffix: String::from(" ") }, - StemmedWord { word_pos: 38, suffix_offset: 42, + StemmedWord { word_pos: 6, suffix_offset: 4, stemmed: String::from("stem"), suffix: String::from("mING") }, ]; assert_eq!(result.len(), expected.len()); @@ -182,9 +182,9 @@ mod tests { let expected = vec![ StemmedWord { word_pos: 0, suffix_offset: 6, stemmed: String::from("@!? "), suffix: String::from("") }, - StemmedWord { word_pos: 6, suffix_offset: 6, + StemmedWord { word_pos: 1, suffix_offset: 0, stemmed: String::from("let"), suffix: String::from("Let's ") }, - StemmedWord { word_pos: 12, suffix_offset: 15, + StemmedWord { word_pos: 2, suffix_offset: 3, stemmed: String::from("see"), suffix: String::from("ing...") }, ]; assert_eq!(result.len(), expected.len()); @@ -200,7 +200,7 @@ mod tests { let expected = vec![ StemmedWord { word_pos: 0, suffix_offset: 0, stemmed: String::from("ünicöd"), suffix: String::from("Ünicöde ") }, - StemmedWord { word_pos: 10, suffix_offset: 14, + StemmedWord { word_pos: 1, suffix_offset: 4, stemmed: String::from("stem"), suffix: String::from("ming") }, ]; assert_eq!(result.len(), expected.len()); From 46da627b0d8e2574a00ff520be64f79359cd8389 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sat, 24 Dec 2016 12:32:54 -0800 Subject: [PATCH 049/122] Fix parsing object paths to disallow strings in the wrong place. 
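For context, the rule this commit enforces: a quoted string may stand in for a bare field name only where an object key is expected (as in find {"some key": == "a"}), while dotted keypaths such as .A[0] accept bare fields only. Below is a minimal, self-contained sketch of that split; MiniParser and its methods are illustrative stand-ins for the consume_field/consume_key/consume_string_literal trio in the diff that follows (simplified cursor handling, no escape sequences), not the patch's actual code.

// Illustrative sketch only; not part of the patch.
struct MiniParser<'a> {
    query: &'a str,
    offset: usize,
}

impl<'a> MiniParser<'a> {
    // Bare fields: first char alphabetic, '_' or '$'; the rest may also be numeric.
    fn consume_field(&mut self) -> Option<String> {
        let mut result = String::new();
        let mut chars = self.query[self.offset..].chars();
        if let Some(c) = chars.next() {
            if c.is_alphabetic() || c == '_' || c == '$' {
                result.push(c);
                for c in chars {
                    if c.is_alphanumeric() || c == '_' || c == '$' {
                        result.push(c);
                    } else {
                        break;
                    }
                }
            }
        }
        if result.is_empty() {
            None
        } else {
            self.offset += result.len();
            Some(result)
        }
    }

    // Quoted keys; this sketch skips escape handling.
    fn consume_string(&mut self) -> Option<String> {
        let rest = &self.query[self.offset..];
        if !rest.starts_with('"') {
            return None;
        }
        match rest[1..].find('"') {
            Some(end) => {
                self.offset += end + 2; // opening quote + contents + closing quote
                Some(rest[1..end + 1].to_string())
            }
            None => None,
        }
    }

    // Object keys accept either form; keypath segments accept bare fields only.
    fn consume_key(&mut self) -> Option<String> {
        self.consume_field().or_else(|| self.consume_string())
    }
}

fn main() {
    let mut p = MiniParser { query: "\"some key\"", offset: 0 };
    assert_eq!(p.consume_key(), Some("some key".to_string()));

    let mut p = MiniParser { query: "\"some key\"", offset: 0 };
    assert_eq!(p.consume_field(), None); // a string in the wrong place is rejected

    println!("ok");
}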
--- src/query.rs | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/query.rs b/src/query.rs index 32b9c82..9e8dd22 100644 --- a/src/query.rs +++ b/src/query.rs @@ -166,7 +166,17 @@ impl<'a> Parser<'a> { self.query[self.offset..].starts_with(token) } - fn consume_field(&mut self) -> Result, Error> { + fn consume_key(&mut self) -> Result, Error> { + if let Some(key) = self.consume_field() { + Ok(Some(key)) + } else if let Some(key) = try!(self.consume_string_literal()) { + Ok(Some(key)) + } else { + Ok(None) + } + } + + fn consume_field(&mut self) -> Option { let mut result = String::new(); { let mut chars = self.query[self.offset..].chars(); @@ -187,9 +197,9 @@ impl<'a> Parser<'a> { if result.len() > 0 { self.offset += result.len(); self.ws(); - Ok(Some(result)) + Some(result) } else { - self.consume_string_literal() + None } } @@ -218,7 +228,7 @@ impl<'a> Parser<'a> { try!(self.must_consume("]")); key } else { - if let Some(key) = try!(self.consume_field()) { + if let Some(key) = self.consume_field() { key } else { self.ws(); @@ -241,7 +251,7 @@ impl<'a> Parser<'a> { } try!(self.must_consume("]")); } else if self.consume(".") { - if let Some(key) = try!(self.consume_field()) { + if let Some(key) = self.consume_field() { kb.push_object_key(&key); } else { return Err(Error::Parse("Expected object key.".to_string())); @@ -420,7 +430,7 @@ ws1 fn ocompare<'b>(&'b mut self) -> Result, Error> { if let Some(filter) = try!(self.oparens()) { Ok(filter) - } else if let Some(field) = try!(self.consume_field()) { + } else if let Some(field) = try!(self.consume_key()) { self.kb.push_object_key(&field); try!(self.must_consume(":")); if let Some(filter) = try!(self.oparens()) { @@ -582,7 +592,7 @@ ws1 try!(self.must_consume("{")); let mut fields: Vec<(String, Box)> = Vec::new(); loop { - if let Some(field) = try!(self.consume_field()) { + if let Some(field) = try!(self.consume_key()) { try!(self.must_consume(":")); if let Some(ret_value) = try!(self.ret_value()) { fields.push((field, ret_value)); From a24bf7cfa7ca2e96b46748491811321816b8ef1b Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 30 Dec 2016 19:45:24 -0800 Subject: [PATCH 050/122] added group, sort, avg, sum, min, max, count, concat, list Also can add default= for values that are missing. See test_query_group and test_query_json_collation in query.rs for details. 
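The aggregate machinery added here represents each aggregate as per-slot function pointers: an optional init, an action folded over every buffered row, and an optional extract finalizer (see the aggr_inits/aggr_actions/aggr_finals fields in the query.rs hunks below). The following is a rough, self-contained model of that shape; the Json enum, the AggrImpls struct, and the two-argument action signature are all simplified assumptions rather than the patch's exact types.

// Illustrative sketch only; not part of the patch.
#[derive(Debug, Clone, PartialEq)]
enum Json {
    Num(f64),
    Str(String),
    Null,
}

struct AggrImpls {
    init: Option<fn(&mut Json)>,    // run once before folding
    action: fn(&mut Json, Json),    // folded over every row
    extract: Option<fn(&mut Json)>, // finalizer (e.g. avg = sum / count)
}

fn sum_impls() -> AggrImpls {
    AggrImpls {
        init: Some(|acc| *acc = Json::Num(0.0)),
        action: |acc, next| {
            if let (Json::Num(a), Json::Num(b)) = (acc.clone(), next) {
                *acc = Json::Num(a + b);
            }
        },
        extract: None,
    }
}

fn main() {
    // Each row is one fetched result; slot 1 holds the value bound to sum().
    let rows = vec![
        vec![Json::Str("a".to_string()), Json::Num(1.0)],
        vec![Json::Str("a".to_string()), Json::Num(2.5)],
    ];
    let agg = sum_impls();
    let slot = 1;
    let mut acc = vec![rows[0][0].clone(), Json::Null];
    if let Some(init) = agg.init {
        init(&mut acc[slot]);
    }
    for row in &rows {
        (agg.action)(&mut acc[slot], row[slot].clone());
    }
    assert_eq!(acc[slot], Json::Num(3.5));
    println!("{:?}", acc);
}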
--- Cargo.toml | 2 +- src/error.rs | 13 + src/json_shred.rs | 61 +- src/key_builder.rs | 24 - src/query.rs | 1728 +++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 1661 insertions(+), 167 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 63b292c..36b4bc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ build = "build.rs" [dependencies] capnp = "0.7.4" -rustc-serialize= "0.3.19" +rustc-serialize = "0.3.19" stemmer = "0.3.2" unicode-normalization = "0.1.2" unicode-segmentation = "0.1.2" diff --git a/src/error.rs b/src/error.rs index 55f286c..59698e6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,6 +3,7 @@ extern crate rocksdb; use std::{error, fmt}; use std::num::ParseIntError; +use std::num::ParseFloatError; use std::io; @@ -16,6 +17,12 @@ pub enum Error { Io(io::Error), } +impl PartialEq for Error { + fn eq(&self, other: &Error) -> bool { + self == other + } +} + impl error::Error for Error { fn description(&self) -> &str { match *self { @@ -63,6 +70,12 @@ impl From for Error { } } +impl From for Error { + fn from(err: ParseFloatError) -> Error { + Error::Parse(err.to_string()) + } +} + impl From for Error { fn from(err: io::Error) -> Error { Error::Io(err) diff --git a/src/json_shred.rs b/src/json_shred.rs index 4ddf415..bedf7bd 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -100,7 +100,6 @@ impl Shredder { } let key = self.kb.value_key(docseq); - let mut buffer = String::with_capacity(text.len() + 1); buffer.push('s'); buffer.push_str(&text); @@ -113,7 +112,6 @@ impl Shredder { fn add_value(&mut self, code: char, value: &[u8], docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { let key = self.kb.value_key(docseq); - let mut buffer = Vec::with_capacity(value.len() + 1); buffer.push(code as u8); try!((&mut buffer as &mut Write).write_all(&value)); @@ -122,6 +120,7 @@ impl Shredder { Ok(()) } + fn maybe_add_value(&mut self, parser: &Parser, code: char, value: &[u8], docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { if self.ignore_children == 0 { @@ -137,7 +136,6 @@ impl Shredder { self.kb.push_object_key(&key); try!(self.add_value(code, &value, docseq, batch)); - self.kb.inc_top_array_offset(); }, ObjectKeyTypes::NoKey => { try!(self.add_value(code, &value, docseq, batch)); @@ -152,44 +150,38 @@ impl Shredder { } // Extract key if it exists and indicates if it's a special type of key fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { - if self.kb.last_pushed_keypath_is_object_key() { - match stack_element { - Some(StackElement::Key(key)) => { - if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { - if key == "_id" { - ObjectKeyTypes::Id - } else { - ObjectKeyTypes::Ignore - } + match stack_element { + Some(StackElement::Key(key)) => { + if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { + if key == "_id" { + ObjectKeyTypes::Id } else { - ObjectKeyTypes::Key(key.to_string()) + ObjectKeyTypes::Ignore } - }, - _ => ObjectKeyTypes::NoKey, - } - } else { - ObjectKeyTypes::NoKey + } else { + ObjectKeyTypes::Key(key.to_string()) + } + }, + _ => ObjectKeyTypes::NoKey, } } // If we are inside an object we need to push the key to the key builder // Don't push them if they are reserved fields (starting with underscore) fn maybe_push_key(&mut self, stack_element: Option) -> Result<(), Error> { - if self.kb.last_pushed_keypath_is_object_key() { - if let Some(StackElement::Key(key)) = stack_element { - if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { - if key == "_id" 
{ - return Err(Error::Shred( - "Expected string for `_id` field, got another type".to_string())); - } else { - self.ignore_children = 1; - } + if let Some(StackElement::Key(key)) = stack_element { + if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { + if key == "_id" { + return Err(Error::Shred( + "Expected string for `_id` field, got another type".to_string())); } else { - // Pop the dummy object that makes ObjectEnd happy - // or the previous object key - self.kb.pop_object_key(); - self.kb.push_object_key(key); + self.ignore_children = 1; } + } else { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.kb.pop_object_key(); + self.kb.push_object_key(key); } } Ok(()) @@ -212,6 +204,7 @@ impl Shredder { self.ignore_children += 1; } else { + try!(self.maybe_push_key(parser.stack().top())); // Just push something to make `ObjectEnd` happy self.kb.push_object_key(""); object_keys_indexed.push(false); @@ -231,10 +224,10 @@ impl Shredder { } }, Some(JsonEvent::ArrayStart) => { - try!(self.maybe_push_key(parser.stack().top())); if self.ignore_children > 0 { self.ignore_children += 1; } else { + try!(self.maybe_push_key(parser.stack().top())); self.kb.push_array(); } }, @@ -273,7 +266,6 @@ impl Shredder { *object_keys_indexed.last_mut().unwrap() = true; try!(self.add_entries(&value, docseq, batch)); - self.kb.inc_top_array_offset(); }, ObjectKeyTypes::NoKey => { try!(self.add_entries(&value, docseq, batch)); @@ -315,9 +307,6 @@ impl Shredder { }; token = parser.next(); - if token == None { - break; - } } Ok(self.doc_id.clone()) } diff --git a/src/key_builder.rs b/src/key_builder.rs index f017141..4a64b68 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -193,14 +193,6 @@ impl KeyBuilder { self.arraypath.len() } - pub fn last_pushed_keypath_is_object_key(&self) -> bool { - if self.keypath.is_empty() { - false - } else { - self.keypath.last().unwrap().starts_with(".") - } - } - pub fn keypath_segments_len(&self) -> usize { self.keypath.len() } @@ -304,22 +296,6 @@ mod tests { assert_eq!(kb.keypath_segments_len(), 0, "No segments so far"); } - #[test] - fn test_last_pushed_segment_type() { - let mut kb = KeyBuilder::new(); - assert_eq!(kb.keypath_segments_len(), 0, "No segments"); - - kb.push_object_key("first"); - assert!(kb.last_pushed_keypath_is_object_key(), "Last segment is an object key"); - - kb.push_object_key("second"); - assert!(kb.last_pushed_keypath_is_object_key(), "Last segment is an object key"); - - kb.push_array(); - assert!(!kb.last_pushed_keypath_is_object_key(), "Last segment is an array"); -; - } - #[test] fn test_doc_result_parse() { let key = "W.foo$.bar$!word#123,1,0".to_string(); diff --git a/src/query.rs b/src/query.rs index 9e8dd22..ba3e22f 100644 --- a/src/query.rs +++ b/src/query.rs @@ -6,9 +6,10 @@ use std::cmp::Ordering; use std::io::Write; use std::collections::HashMap; use std::iter::Peekable; -use std::mem::transmute; +use std::mem::{transmute, swap}; use std::collections::VecDeque; use std::iter::Iterator; +use std::usize; use error::Error; use index::Index; @@ -50,6 +51,81 @@ impl Ord for DocResult { pub struct Query {} +impl Query { + pub fn get_matches<'a>(query: String, index: &'a Index) -> Result, Error> { + match index.rocks { + Some(ref rocks) => { + let snapshot = Snapshot::new(&rocks); + let mut parser = Parser::new(query, snapshot); + let filter = try!(parser.build_filter()); + let mut sorts = try!(parser.sort_clause()); + let mut returnable = try!(parser.return_clause()); + let limit = 
try!(parser.limit_clause()); + + let mut ags = Vec::new(); + returnable.get_aggregate_funs(&mut ags); + + let mut has_ags = false; + for option_ag in ags.iter() { + if option_ag.is_some() { + has_ags = true; + break; + } + } + let has_sorting = !sorts.is_empty(); + + returnable = if has_sorting && has_ags { + return Err(Error::Parse("Cannot have aggregates and sorting in the same query" + .to_string())); + } else if has_sorting { + returnable.take_sort_for_matching_fields(&mut sorts); + if !sorts.is_empty() { + let vec = sorts.into_iter() + .map(|(_key, sort_info)| + RetValue {kb: sort_info.kb, + ag: None, + default: sort_info.default, + sort: Some(sort_info.sort)}) + .collect(); + Box::new(RetHidden{unrendered: vec, visible: returnable}) + } else { + returnable + } + } else { + returnable + }; + + let option_ags = if has_ags { + // we have at least one AggregateFun. Make sure they are all set. + for option_ag in ags.iter() { + if option_ag.is_none() { + return Err(Error::Parse("Return keypaths must either all have \ aggregate functions, or none can have them.".to_string())); + } + } + Some(ags.into_iter().map(|option| option.unwrap()).collect()) + } else { + None + }; + + let sorting = if has_sorting { + let mut sorting = Vec::new(); + returnable.get_sorting(&mut sorting); + Some(sorting) + } else { + None + }; + + Ok(QueryResults::new(filter, parser.snapshot, returnable, + option_ags, sorting, limit)) + }, + None => { + Err(Error::Parse("You must open the index first".to_string())) + }, + } + } +} + pub struct QueryResults<'a> { filter: Box, doc_result_next: DocResult, @@ -57,24 +133,100 @@ pub struct QueryResults<'a> { iter: DBIterator, returnable: Box, buffer: Vec, + needs_sorting_and_ags: bool, + done_with_sorting_and_ags: bool, + does_group_or_aggr: bool, + sorts: Option>, + aggr_inits: Vec<(fn (&mut JsonValue), usize)>, + aggr_actions: Vec<(fn (&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, + aggr_finals: Vec<(fn (&mut JsonValue), usize)>, + in_buffer: Vec>, + sorted_buffer: Vec>, + limit: usize, } impl<'a> QueryResults<'a> { fn new(filter: Box, snapshot: Snapshot<'a>, - returnable: Box) -> QueryResults<'a> { - let iter = snapshot.iterator(IteratorMode::Start); + returnable: Box, + ags: Option>, + sorting: Option>>, + limit: usize) -> QueryResults<'a> { + + // the input args for sorts and ags are vecs where the slot is the same slot as + // a result that the action needs to be applied to. We instead convert them + // into several new fields with tuples of action and the slot to act on. 
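// For example (values illustrative): sorting == Some(vec![None, Some(Sort::Asc), None])
// is converted below into sorts == vec![(Sort::Asc, 1)], so cmp_results()
// visits only slot 1 instead of scanning every slot on each comparison.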
+ // this way we don't needlessly loop over the actions where most are noops + + // only one can be Some at a time + debug_assert!(!sorting.is_some() && !ags.is_some() || sorting.is_some() ^ ags.is_some()); + + let needs_sorting_and_ags = ags.is_some() || sorting.is_some(); + + let mut sorts = Vec::new(); + if let Some(mut sorting) = sorting { + let mut n = sorting.len(); + while let Some(option) = sorting.pop() { + n -= 1; + if let Some(sort_dir) = option { + sorts.push((sort_dir, n)); + } + } + // order we process sorts is important + sorts.reverse(); + } + let mut does_group_or_aggr = false; + let mut aggr_inits = Vec::new(); + let mut aggr_actions = Vec::new(); + let mut aggr_finals = Vec::new(); + if let Some(mut ags) = ags { + does_group_or_aggr = true; + let mut n = ags.len(); + while let Some((ag, user_arg)) = ags.pop() { + n -= 1; + if ag == AggregateFun::GroupAsc { + sorts.push((Sort::Asc, n)); + } else if ag == AggregateFun::GroupDesc { + sorts.push((Sort::Desc, n)); + } else { + let ag_impls = ag.get_fun_impls(); + if let Some(init) = ag_impls.init { + aggr_inits.push((init, n)); + } + if let Some(extract) = ag_impls.extract { + aggr_finals.push((extract, n)); + } + aggr_actions.push((ag_impls.action, user_arg, n)); + } + } + // the order we process groups is important + sorts.reverse(); + } + QueryResults{ filter: filter, doc_result_next: DocResult::new(), + iter: snapshot.iterator(IteratorMode::Start), snapshot: snapshot, - iter: iter, returnable: returnable, buffer: Vec::new(), + needs_sorting_and_ags: needs_sorting_and_ags, + done_with_sorting_and_ags: false, + does_group_or_aggr: does_group_or_aggr, + sorts: Some(sorts), + aggr_inits: aggr_inits, + aggr_actions: aggr_actions, + aggr_finals: aggr_finals, + in_buffer: Vec::new(), + sorted_buffer: Vec::new(), + limit: limit, } } fn get_next(&mut self) -> Result, Error> { + if self.done_with_sorting_and_ags { + return Ok(None); + } let result = try!(self.filter.first_result(&self.doc_result_next)); match result { Some(doc_result) => { @@ -101,15 +253,235 @@ impl<'a> QueryResults<'a> { } pub fn next_result(&mut self) -> Result, Error> { - let seq = match try!(self.get_next()) { - Some(seq) => seq, - None => return Ok(None), - }; - let bind = HashMap::new(); - let mut results = VecDeque::new(); - try!(self.returnable.fetch_result(&mut self.iter, seq, &bind, &mut results)); - try!(self.returnable.write_result(&mut results, &mut self.buffer)); - Ok(Some(unsafe{str::from_utf8_unchecked(&self.buffer[..])}.to_string())) + if self.needs_sorting_and_ags { + loop { + match if self.done_with_sorting_and_ags { None } else { try!(self.get_next()) } { + Some(seq) => { + let bind = HashMap::new(); + let mut results = VecDeque::new(); + try!(self.returnable.fetch_result(&mut self.iter, seq, + &bind, &mut results)); + self.in_buffer.push(results); + if self.in_buffer.len() == self.limit { + self.do_sorting_and_ags(); + } + }, + None => { + if !self.done_with_sorting_and_ags { + self.do_sorting_and_ags(); + self.done_with_sorting_and_ags = true; + if !self.aggr_finals.is_empty() { + // need to finalize the values + for end in self.sorted_buffer.iter_mut() { + for &(ref finalize, n) in self.aggr_finals.iter() { + (finalize)(&mut end[n]); + } + } + } + } + if let Some(mut result) = self.sorted_buffer.pop() { + self.buffer.clear(); + try!(self.returnable.write_result(&mut result, &mut self.buffer)); + let str = unsafe{str::from_utf8_unchecked(&self.buffer[..])}; + return Ok(Some(str.to_string())); + } else { + return Ok(None); + } + }, + } + } + } 
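// Sketch of the two paths: with sorts or aggregates (above), rows are
// buffered into in_buffer, merged into sorted_buffer in batches of `limit`
// via do_sorting_and_ags(), and only drained once the filter is exhausted.
// The `else` branch below is the streaming path: no sorting or aggregation,
// so each row is fetched, rendered and returned immediately.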
else { + let seq = match try!(self.get_next()) { + Some(seq) => seq, + None => return Ok(None), + }; + let bind = HashMap::new(); + let mut results = VecDeque::new(); + try!(self.returnable.fetch_result(&mut self.iter, seq, &bind, &mut results)); + self.buffer.clear(); + try!(self.returnable.write_result(&mut results, &mut self.buffer)); + Ok(Some(unsafe{str::from_utf8_unchecked(&self.buffer[..])}.to_string())) + } + } + + fn cmp_results(sorts: &Vec<(Sort, usize)>, + a: &VecDeque, b: &VecDeque) -> Ordering { + for &(ref sort_dir, n) in sorts.iter() { + let cmp = if *sort_dir != Sort::Desc { + b[n].cmp(&a[n]) + } else { + a[n].cmp(&b[n]) + }; + + if cmp != Ordering::Equal { + return cmp; + } + } + Ordering::Equal + } + + fn do_sorting_and_ags(&mut self) { + // ugh borrow check madness means this is how this must happen. + // we need to put it back before returning. + let sorts = self.sorts.take().unwrap(); + if !sorts.is_empty() { + self.in_buffer.sort_by(|a, b| QueryResults::cmp_results(&sorts, &a, &b)); + } + // put back + self.sorts = Some(sorts); + + if !self.does_group_or_aggr { + if self.sorted_buffer.is_empty() { + swap(&mut self.sorted_buffer, &mut self.in_buffer); + } else { + //merge the sorted buffers + let mut new_buffer = Vec::with_capacity(self.sorted_buffer.len() + + self.in_buffer.len()); + let mut option_a = self.sorted_buffer.pop(); + let mut option_b = self.in_buffer.pop(); + // take out for borrow check + let sorts = self.sorts.take().unwrap(); + loop { + match (option_a, option_b) { + (Some(a), Some(b)) => { + match QueryResults::cmp_results(&sorts, &a, &b) { + Ordering::Less => { + new_buffer.push(b); + option_a = Some(a); + option_b = self.in_buffer.pop(); + }, + Ordering::Greater => { + new_buffer.push(a); + option_a = self.sorted_buffer.pop(); + option_b = Some(b); + + }, + Ordering::Equal => { + new_buffer.push(a); + new_buffer.push(b); + option_a = self.sorted_buffer.pop(); + option_b = self.in_buffer.pop(); + } + } + if new_buffer.len() >= self.limit { + self.sorted_buffer.clear(); + self.in_buffer.clear(); + new_buffer.truncate(self.limit); + break; + } + }, + (Some(a), None) => { + new_buffer.push(a); + if new_buffer.len() == self.limit { + break; + } + while let Some(a) = self.sorted_buffer.pop() { + new_buffer.push(a); + if new_buffer.len() == self.limit { + break; + } + } + break; + }, + (None, Some(b)) => { + new_buffer.push(b); + if new_buffer.len() == self.limit { + break; + } + while let Some(b) = self.in_buffer.pop() { + new_buffer.push(b); + if new_buffer.len() == self.limit { + break; + } + } + break; + }, + (None, None) => break, + } + } + // put back + self.sorts = Some(sorts); + + new_buffer.reverse(); + swap(&mut self.sorted_buffer, &mut new_buffer); + } + return; + } + + + //merge the sorted buffers + let mut new_buffer = Vec::with_capacity(self.sorted_buffer.len() + + self.in_buffer.len()); + let mut option_old = self.sorted_buffer.pop(); + let mut option_new = self.in_buffer.pop(); + // take out for borrow check + let sorts = self.sorts.take().unwrap(); + loop { + match (option_old, option_new) { + (Some(mut old), Some(mut new)) => { + match QueryResults::cmp_results(&sorts, &old, &new) { + Ordering::Less => { + for &(ref init, n) in self.aggr_inits.iter() { + (init)(&mut new[n]); + } + //push back old value into sorted_buffer, + //then use new value as old value. 
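// Illustrative note: the three arms of this match mirror a classic sorted
// merge with grouping. Equal folds `new` into `old` through the aggregate
// actions, Greater emits `old` as a finished row, and this Less arm
// re-queues `old` and adopts the freshly initialized `new` as the
// accumulating row.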
+ self.sorted_buffer.push(old); + option_old = Some(new); + option_new = self.in_buffer.pop(); + }, + Ordering::Greater => { + new_buffer.push(old); + option_old = self.sorted_buffer.pop(); + option_new = Some(new); + }, + Ordering::Equal => { + for &(ref action, ref user_arg, n) in self.aggr_actions.iter() { + // we can't swap out a value of new directly, so this lets us + // without shifting or cloning values, both of which can be + // expensive + let mut new_n = JsonValue::Null; + swap(&mut new_n, &mut new[n]); + (action)(&mut old[n], new_n, &user_arg); + } + option_old = Some(old); + option_new = self.in_buffer.pop(); + } + } + if new_buffer.len() == self.limit { + self.sorted_buffer.clear(); + self.in_buffer.clear(); + break; + } + }, + (Some(old), None) => { + new_buffer.push(old); + if new_buffer.len() == self.limit { + break; + } + while let Some(old) = self.sorted_buffer.pop() { + new_buffer.push(old); + if new_buffer.len() == self.limit { + break; + } + } + break; + }, + (None, Some(mut new)) => { + for &(ref init, n) in self.aggr_inits.iter() { + (init)(&mut new[n]); + } + option_old = Some(new); + option_new = self.in_buffer.pop(); + }, + (None, None) => break, + } + } + // put back + self.sorts = Some(sorts); + + new_buffer.reverse(); + swap(&mut self.sorted_buffer, &mut new_buffer); } } @@ -221,6 +593,84 @@ impl<'a> Parser<'a> { } } + fn consume_default(&mut self) -> Result, Error> { + if self.consume("default") { + try!(self.must_consume("=")); + if let Some(json) = try!(self.json()) { + Ok(Some(json)) + } else { + Err(Error::Parse("Expected json value for default".to_string())) + } + } else { + Ok(None) + } + } + + fn consume_aggregate(&mut self) -> Result, Error> { + let offset = self.offset; + let mut aggregate_fun = if self.consume("group") { + AggregateFun::GroupAsc + } else if self.consume("sum") { + AggregateFun::Sum + } else if self.consume("max") { + AggregateFun::Max + } else if self.consume("min") { + AggregateFun::Min + } else if self.consume("list") { + AggregateFun::List + } else if self.consume("concat") { + AggregateFun::Concat + } else if self.consume("avg") { + AggregateFun::Avg + } else if self.consume("count") { + AggregateFun::Count + } else { + return Ok(None) + }; + + if self.consume("(") { + if aggregate_fun == AggregateFun::Count { + try!(self.must_consume(")")); + Ok(Some((aggregate_fun, KeyBuilder::new(), JsonValue::Null))) + } else if aggregate_fun == AggregateFun::Concat { + if let Some(kb) = try!(self.consume_keypath()) { + let json = if self.consume("sep") { + try!(self.must_consume("=")); + JsonValue::String(try!(self.must_consume_string_literal())) + } else { + JsonValue::String(",".to_string()) + }; + try!(self.must_consume(")")); + Ok(Some((aggregate_fun, kb, json))) + } else { + Err(Error::Parse("Expected keypath or bind variable".to_string())) + } + } else if let Some(kb) = try!(self.consume_keypath()) { + if self.consume("order") { + try!(self.must_consume("=")); + if self.consume("asc") { + aggregate_fun = AggregateFun::GroupAsc; + } else if self.consume("desc") { + aggregate_fun = AggregateFun::GroupDesc; + } else { + return Err(Error::Parse("Expected asc or desc".to_string())); + } + } + try!(self.must_consume(")")); + + Ok(Some((aggregate_fun, kb, JsonValue::Null))) + } else { + Err(Error::Parse("Expected keypath or bind variable".to_string())) + } + } else { + // this consumed word above might be a Bind var. Unconsume and return nothing. 
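// For reference, aggregate shapes this function accepts (from the tests):
//     count()                -> (Count, empty keypath, Null)
//     concat(.baz sep="|")   -> (Concat, keypath .baz, String("|"))
//     group(.baz order=desc) -> (GroupDesc, keypath .baz, Null)
// A consumed word that is not followed by "(" reaches this point and is
// rewound, since it may be a bind variable rather than an aggregate name.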
+ self.offset = offset; + Ok(None) + } + } + fn consume_keypath(&mut self) -> Result, Error> { let key: String = if self.consume(".") { if self.consume("[") { @@ -264,6 +714,147 @@ impl<'a> Parser<'a> { Ok(Some(kb)) } + fn consume_number(&mut self) -> Result, Error> { + // Yes this parsing code is hideously verbose. But it conforms exactly to the json spec + // and uses the rust f64 parser, which can't tell us how many characters it used or needs. + + // At the end it then uses the std rust String::parse() method to parse and return + // the f64 value and advance the self.offset. The rust method is a superset of the + // allowable json syntax, so it will parse any valid json floating point number. It might + // return an error if the number is out of bounds. + let mut result = String::new(); + 'outer: loop { + // this loop isn't a loop, it's just there to scope the self borrow + // and then jump to the end to do another borrow (self.ws()) + let mut chars = self.query[self.offset..].chars(); + let mut c = if let Some(c) = chars.next() { + c + } else { + return Ok(None); + }; + + // parse the sign + c = if c == '-' { + result.push('-'); + if let Some(c) = chars.next() { c } else {return Ok(None); } + } else { + c + }; + + // parse the first digit + let mut leading_zero = false; + c = if c == '0' { + result.push('0'); + leading_zero = true; + if let Some(c) = chars.next() { c } else {return Ok(None); } + } else if c >= '1' && c <= '9' { + result.push(c); + if let Some(c) = chars.next() { c } else {return Ok(None); } + } else if result.is_empty() { + // no sign or digits found. not a number + return Ok(None); + } else { + return Err(Error::Parse("Expected digits after sign (-).".to_string())); + }; + + // parse remaining significant digits + if !leading_zero { + // no more digits allowed if first digit is zero + loop { + c = if c >= '0' && c <= '9' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else { + break; + }; + } + } + + // parse decimal + c = if c == '.' 
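// Examples of what this grammar accepts and rejects: "0", "-1.5" and
// "6.02e+23" parse; ".5" is not treated as a number at all (no leading
// digit), "1." is a hard error ("Expected digits after decimal point."),
// and after a leading zero no further integer digits are consumed,
// mirroring the JSON grammar's leading-zero rule.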
{ + result.push(c); + if let Some(c) = chars.next() { + c + } else { + return Err(Error::Parse("Expected digits after decimal point.".to_string())); + } + } else { + break 'outer; + }; + + // parse mantissa + let mut found_mantissa = false; + loop { + c = if c >= '0' && c <= '9' { + result.push(c); + found_mantissa = true; + + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else { + if found_mantissa { + break; + } + return Err(Error::Parse("Expected digits after decimal point.".to_string())); + }; + } + + // parse exponent symbol + c = if c == 'e' || c == 'E' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + return Err(Error::Parse("Expected exponent after e.".to_string())); + } + } else { + break 'outer; + }; + + // parse exponent sign + c = if c == '+' || c == '-' { + result.push(c); + if let Some(c) = chars.next() { + c + } else { + return Err(Error::Parse("Expected exponent after e.".to_string())); + } + } else { + c + }; + + // parse exponent digits + let mut found_exponent = false; + loop { + c = if c >= '0' && c <= '9' { + result.push(c); + found_exponent = true; + if let Some(c) = chars.next() { + c + } else { + break 'outer; + } + } else { + if found_exponent { + break 'outer; + } + return Err(Error::Parse("Expected exponent after e.".to_string())); + } + } + } + + self.offset += result.len(); + self.ws(); + Ok(Some(try!(result.parse()))) + } + fn must_consume_string_literal(&mut self) -> Result { if let Some(string) = try!(self.consume_string_literal()) { @@ -275,44 +866,71 @@ impl<'a> Parser<'a> { fn consume_string_literal(&mut self) -> Result, Error> { let mut lit = String::new(); - let mut next_is_special_char = false; - if self.could_consume("\"") { - // can't consume("\"") the leading quote because it will also skip leading whitespace - // inside the string literal - self.offset += 1; - for char in self.query[self.offset..].chars() { - if next_is_special_char { - match char { - '\\' | '"' => lit.push(char), - 'n' => lit.push('\n'), - 'b' => lit.push('\x08'), - 'r' => lit.push('\r'), - 'f' => lit.push('\x0C'), - 't' => lit.push('\t'), - 'v' => lit.push('\x0B'), - _ => return Err(Error::Parse(format!("Unknown character escape: {}", - char))), - }; - self.offset += 1; - next_is_special_char = false; + if !self.could_consume("\"") { + return Ok(None); + } + // can't consume("\"") the leading quote because it will also skip leading whitespace + // inside the string literal + self.offset += 1; + { + let mut chars = self.query[self.offset..].chars(); + 'outer: loop { + let char = if let Some(char) = chars.next() { + char + } else { + break; + }; + if char == '\\' { + self.offset += 1; + + let char = if let Some(char) = chars.next() { + char } else { - if char == '"' { - break; - } else if char == '\\' { - next_is_special_char = true; - self.offset += 1; - } else { - lit.push(char); - self.offset += char.len_utf8(); - } + break; + }; + match char { + '\\' | '"' | '/' => lit.push(char), + 'n' => lit.push('\n'), + 'b' => lit.push('\x08'), + 'r' => lit.push('\r'), + 'f' => lit.push('\x0C'), + 't' => lit.push('\t'), + 'v' => lit.push('\x0B'), + 'u' => { + let mut n = 0; + for _i in 0..4 { + let char = if let Some(char) = chars.next() { + char + } else { + break 'outer; + }; + n = match char { + c @ '0' ... '9' => n * 16 + ((c as u16) - ('0' as u16)), + c @ 'a' ... 'f' => n * 16 + (10 + (c as u16) - ('a' as u16)), + c @ 'A' ... 
'F' => n * 16 + (10 + (c as u16) - ('A' as u16)), + _ => return Err(Error::Parse(format!( + "Invalid hexadecimal escape: {}", char))), + }; + + } + // push the decoded code point; surrogate pairs are not handled here + match ::std::char::from_u32(n as u32) { + Some(c) => lit.push(c), + None => return Err(Error::Parse(format!("Invalid unicode escape: {}", n))), + } + self.offset += 3; // 3 because 1 is always added after the match below + }, + _ => return Err(Error::Parse(format!("Unknown character escape: {}", + char))), + }; + self.offset += 1; + } else { + if char == '"' { + break; + } else { + lit.push(char); + self.offset += char.len_utf8(); + } } } - try!(self.must_consume("\"")); - self.ws(); - Ok(Some(lit)) - } else { - Ok(None) } + } + try!(self.must_consume("\"")); + Ok(Some(lit)) } /* @@ -574,6 +1192,55 @@ ws1 Ok(filter) } + fn sort_clause(&mut self) -> Result, Error> { + let mut sort_infos = HashMap::new(); + if self.consume("sort") { + loop { + if let Some(kb) = try!(self.consume_keypath()) { + // we check for asc/desc twice so the user can put it before + // or after the default clause. Yes it's a hack, but it's simple. + let mut sort = if self.consume("asc") { + Sort::Asc + } else if self.consume("desc") { + Sort::Desc + } else { + Sort::Asc + }; + + let default = if self.consume("default") { + try!(self.must_consume("=")); + if let Some(json) = try!(self.json()) { + json + } else { + return Err(Error::Parse("Expected Json after default.".to_string())); + } + } else { + JsonValue::Null + }; + + sort = if self.consume("asc") { + Sort::Asc + } else if self.consume("desc") { + Sort::Desc + } else { + sort + }; + + sort_infos.insert(kb.value_key(0), SortInfo{kb:kb, + sort:sort, + default:default}); + if !self.consume(",") { + break; + } + } + } + if sort_infos.is_empty() { + return Err(Error::Parse("Expected field path in sort expression.".to_string())); + } + } + Ok(sort_infos) + } + fn return_clause(&mut self) -> Result, Error> { if self.consume("return") { if let Some(ret_value) = try!(self.ret_value()) { @@ -584,7 +1251,7 @@ ws1 } else { let mut kb = KeyBuilder::new(); kb.push_object_key("_id"); - Ok(Box::new(RetValue{kb:kb})) + Ok(Box::new(RetValue{kb: kb, ag:None, default: JsonValue::Null, sort: None})) } } @@ -608,9 +1275,6 @@ ws1 } try!(self.must_consume("}")); - if fields.is_empty() { - return Err(Error::Parse("Found empty object in return.".to_string())); - } Ok(Box::new(RetObject{fields: fields})) } @@ -628,41 +1292,330 @@ ws1 } } try!(self.must_consume("]")); - if slots.is_empty() { - return Err(Error::Parse("Found empty array in return.".to_string())); - } Ok(Box::new(RetArray{slots: slots})) } fn ret_value(&mut self) -> Result>, Error> { - if let Some(kb) = try!(self.consume_keypath()) { - Ok(Some(Box::new(RetValue{kb: kb}))) + if let Some((ag, kb, json)) = try!(self.consume_aggregate()) { + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + Ok(Some(Box::new(RetValue{kb: kb, ag: Some((ag, json)), + default: default, sort:None}))) + } + else if let Some(kb) = try!(self.consume_keypath()) { + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + Ok(Some(Box::new(RetValue{kb: kb, ag: None, default: default, sort: None}))) + } else if self.could_consume("{") { + Ok(Some(try!(self.ret_object()))) + } else if self.could_consume("[") { + Ok(Some(try!(self.ret_array()))) + } else if let Some(string) = try!(self.consume_string_literal()) { + Ok(Some(Box::new(RetLiteral{json: JsonValue::String(string)}))) + } else if let Some(num) = try!(self.consume_number()) { + Ok(Some(Box::new(RetLiteral{json: JsonValue::Number(num)}))) + } else { + if self.consume("true") { 
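// Literal returns: alongside keypaths, a return clause may contain plain
// JSON scalars, e.g. (from the tests)
//     return {"a":"a", "b":1.123, "true":true, "false":false, "null":null}
// The true/false/null arms here and the string/number arms above produce
// these as RetLiteral values.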
Ok(Some(Box::new(RetLiteral{json: JsonValue::True}))) + } else if self.consume("false") { + Ok(Some(Box::new(RetLiteral{json: JsonValue::False}))) + } else if self.consume("null") { + Ok(Some(Box::new(RetLiteral{json: JsonValue::Null}))) + } else { + Ok(None) + } } } - fn build_filter(&mut self) -> Result, Error> { - self.ws(); - Ok(try!(self.find())) + fn limit_clause(&mut self) -> Result { + if self.consume("limit") { + if let Some(i) = try!(self.consume_integer()) { + if i <= 0 { + return Err(Error::Parse("limit must be an integer greater than 0" + .to_string())); + } + Ok(i as usize) + } else { + return Err(Error::Parse("limit expects an integer greater than 0" + .to_string())); + } + } else { + Ok(usize::MAX) + } } -} - -pub trait Returnable { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, - bind_var_keys: &HashMap, - result: &mut VecDeque>) -> Result<(), Error>; + fn json(&mut self) -> Result, Error> { + if self.could_consume("{") { + Ok(Some(try!(self.json_object()))) + } else if self.could_consume("[") { + Ok(Some(try!(self.json_array()))) + } else if let Some(string) = try!(self.consume_string_literal()) { + Ok(Some(JsonValue::String(string))) + } else { + if self.consume("true") { + Ok(Some(JsonValue::True)) + } else if self.consume("false") { + Ok(Some(JsonValue::False)) + } else if self.consume("null") { + Ok(Some(JsonValue::Null)) + } else if let Some(num) = try!(self.consume_number()) { + Ok(Some(JsonValue::Number(num))) + } else { + Ok(None) + } + } + } - fn write_result(&self, results: &mut VecDeque>, - write: &mut Write) -> Result<(), Error>; + fn json_object(&mut self) -> Result { + try!(self.must_consume("{")); + let mut object = Vec::new(); + if self.consume("}") { + return Ok(JsonValue::Object(object)); + } + loop { + if let Some(field) = try!(self.consume_key()) { + try!(self.must_consume(":")); + if let Some(json) = try!(self.json()) { + object.push((field, json)); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } + try!(self.must_consume("}")); + Ok(JsonValue::Object(object)) + } + + fn json_array(&mut self) -> Result { + try!(self.must_consume("[")); + let mut array = Vec::new(); + if self.consume("]") { + return Ok(JsonValue::Array(array)); + } + loop { + if let Some(json) = try!(self.json()) { + array.push(json); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } + try!(self.must_consume("]")); + Ok(JsonValue::Array(array)) + } + + fn build_filter(&mut self) -> Result, Error> { + self.ws(); + Ok(try!(self.find())) + } +} + + + +#[derive(PartialEq, Eq, Clone)] +pub enum AggregateFun { + GroupAsc, + GroupDesc, + Sum, + Max, + Min, + List, + Concat, + Avg, + Count, } +struct AggregateFunImpls { + init: Option, + action: fn (&mut JsonValue, JsonValue, &JsonValue), + extract: Option, +} + +impl AggregateFun { + fn get_fun_impls(&self) -> AggregateFunImpls { + match self { + &AggregateFun::GroupAsc => panic!("cannot get aggregate fun for grouping!"), + &AggregateFun::GroupDesc => panic!("cannot get aggregate fun for grouping!"), + &AggregateFun::Sum => AggregateFunImpls{ + init: Some(AggregateFun::sum_init), + action: AggregateFun::sum, + extract: None, + }, + &AggregateFun::Max => AggregateFunImpls{ + init: None, + action: AggregateFun::max, + extract: None, + }, + &AggregateFun::Min => AggregateFunImpls{ + init: None, + action: 
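// (sketch) Each AggregateFunImpls entry wires up to three hooks: `init` runs
// when a value starts a new group, `action` folds every further value of the
// group into the accumulator, and `extract` (used only by Avg) converts the
// accumulator into the final value. Min and Max need no init because the
// first value is already a valid accumulator.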
AggregateFun::min, + extract: None, + }, + &AggregateFun::List => AggregateFunImpls{ + init: Some(AggregateFun::list_init), + action: AggregateFun::list, + extract: None, + }, + &AggregateFun::Concat => AggregateFunImpls{ + init: Some(AggregateFun::concat_init), + action: AggregateFun::concat, + extract: None, + }, + &AggregateFun::Avg => AggregateFunImpls{ + init: Some(AggregateFun::avg_init), + action: AggregateFun::avg, + extract: Some(AggregateFun::avg_final), + }, + &AggregateFun::Count => AggregateFunImpls{ + init: Some(AggregateFun::count_init), + action: AggregateFun::count, + extract: None, + }, + } + } + + fn sum_init(existing: &mut JsonValue) { + if let &mut JsonValue::Number(_) = existing { + //do nothing + } else { + *existing = JsonValue::Number(0.0) + } + } + + fn sum(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + match (existing, new) { + (&mut JsonValue::Number(ref mut existing), JsonValue::Number(new)) => { + *existing += new; + }, + _ => (), + } + } + + fn max(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if *existing < new { + *existing = new + } + } + + fn min(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if *existing > new { + *existing = new + } + } + + fn list_init(existing: &mut JsonValue) { + *existing = JsonValue::Array(vec![existing.clone()]); + } + + fn list(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let &mut JsonValue::Array(ref mut existing) = existing { + existing.push(new); + } + } + + fn concat_init(existing: &mut JsonValue) { + if let &mut JsonValue::String(ref _string) = existing { + // do nothing + } else { + // replace non-string values with an empty string accumulator + *existing = JsonValue::String(String::new()); + } + } + + fn concat(existing: &mut JsonValue, new: JsonValue, user_arg: &JsonValue) { + if let &mut JsonValue::String(ref mut existing) = existing { + if let JsonValue::String(new) = new { + if let &JsonValue::String(ref user_arg) = user_arg { + existing.push_str(&user_arg); + existing.push_str(&new); + } + } + } + } + + fn avg_init(existing: &mut JsonValue) { + let new = if let &mut JsonValue::Number(ref num) = existing { + JsonValue::Array(vec![JsonValue::Number(num.clone()), JsonValue::Number(1.0)]) + } else { + JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]) + }; + *existing = new; + } + + fn avg(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Number(new) = new { + if let &mut JsonValue::Array(ref mut array) = existing { + let mut avg = if let &JsonValue::Number(ref avg) = &array[0] { + *avg + } else { + // can't happen but the compiler needs this here + 1.0 + }; + + let mut count = if let &JsonValue::Number(ref count) = &array[1] { + *count + } else { + // can't happen but the compiler needs this here + 1.0 + }; + + avg = (avg * count + new) / (count + 1.0); + count += 1.0; + array[0] = JsonValue::Number(avg); + array[1] = JsonValue::Number(count); + } + } + } + + fn avg_final(existing: &mut JsonValue) { + let json = if let &mut JsonValue::Array(ref mut array) = existing { + if let &JsonValue::Number(ref avg) = &array[0] { + if let &JsonValue::Number(ref count) = &array[1] { + if *count == 0.0 { + JsonValue::Null + } else { + JsonValue::Number(*avg) + } + } else { + // can't happen but the compiler needs this here + JsonValue::Null + } + } else { + // can't happen but the compiler needs this here + JsonValue::Null + } + } else { + // can't happen but the compiler needs this here + JsonValue::Null + }; + *existing = json + } + + fn count_init(existing: &mut JsonValue) { 
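// count() starts at 1.0 because init only runs on the row that opens a new
// group; every later row of the group goes through `count` below, which just
// increments. The fetched value itself is ignored (RetValue pushes a Null
// placeholder for Count).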
*existing = JsonValue::Number(1.0); + } + + fn count(existing: &mut JsonValue, _: JsonValue, _user_arg: &JsonValue) { + if let &mut JsonValue::Number(ref mut num) = existing { + *num += 1.0; + } + } +} + +#[derive(PartialEq, PartialOrd, Clone, Debug)] pub enum JsonValue { Number(f64), String(String), @@ -687,9 +1640,102 @@ impl JsonValue { ret } + fn cmp_always_equal(_a: &JsonValue, _b: &JsonValue) -> Ordering { + Ordering::Equal + } + + fn cmp_f64(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Number(a_val) = a { + if let &JsonValue::Number(b_val) = b { + if a_val < b_val { + Ordering::Less + } else if a_val > b_val { + Ordering::Greater + } else { + Ordering::Equal + } + } else { + panic!("cast error in cmp_f64"); + } + } else { + panic!("cast error in cmp_f64"); + } + } + + fn cmp_string(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::String(ref a_val) = a { + if let &JsonValue::String(ref b_val) = b { + // Note we eventually want to switch to a collation library like ICU + a_val.cmp(&b_val) + } else { + panic!("cast error in cmp_string"); + } + } else { + panic!("cast error in cmp_string"); + } + } + + fn cmp_array(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Array(ref a_val) = a { + if let &JsonValue::Array(ref b_val) = b { + for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { + let order = a_el.cmp(&b_el); + if order != Ordering::Equal { + return order; + } + } + // if we got here all elements were equal. But one array might be longer + // so sort it last + a_val.len().cmp(&b_val.len()) + } else { + panic!("cast error in cmp_array"); + } + } else { + panic!("cast error in cmp_array"); + } + } + + fn cmp_object(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Object(ref a_val) = a { + if let &JsonValue::Object(ref b_val) = b { + for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { + // compare key + let mut order = a_el.0.cmp(&b_el.0); + if order != Ordering::Equal { + return order; + } + // compare value + order = a_el.1.cmp(&b_el.1); + if order != Ordering::Equal { + return order; + } + } + // if we got here all elements were equal. 
But one object might be longer + // so sort it last + a_val.len().cmp(&b_val.len()) + } else { + panic!("cast error in cmp_object"); + } + } else { + panic!("cast error in cmp_object"); + } + } + + fn type_sort_order(&self) -> (usize, fn(&JsonValue, &JsonValue) -> Ordering) { + match self { + &JsonValue::Null => (0, JsonValue::cmp_always_equal), + &JsonValue::False => (1, JsonValue::cmp_always_equal), + &JsonValue::True => (2, JsonValue::cmp_always_equal), + &JsonValue::Number(_) => (3, JsonValue::cmp_f64), + &JsonValue::String(_) => (4, JsonValue::cmp_string), + &JsonValue::Array(_) => (5, JsonValue::cmp_array), + &JsonValue::Object(_) => (6, JsonValue::cmp_object), + } + } + fn render(&self, write: &mut Write) -> Result<(), Error> { match self { - &JsonValue::Number(ref num) => try!(write.write_all(num.to_string().as_bytes())), + &JsonValue::Number(ref num) => try!(write.write_all(num.to_string().as_bytes())), &JsonValue::String(ref string) => { try!(write.write_all(JsonValue::str_to_literal(&string).as_bytes())) }, @@ -735,21 +1781,68 @@ impl JsonValue { } } -pub struct RetObject { +impl Eq for JsonValue {} + +impl Ord for JsonValue { + fn cmp(&self, other: &JsonValue) -> Ordering { + let (self_order_num, self_cmp_fun) = self.type_sort_order(); + let (other_order_num, _other_cmp_fun) = other.type_sort_order(); + match self_order_num.cmp(&other_order_num) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => self_cmp_fun(self, other), + } + } +} + +trait Returnable { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + bind_var_keys: &HashMap, + result: &mut VecDeque) -> Result<(), Error>; + + fn get_aggregate_funs(&self, funs: &mut Vec>); + + fn get_sorting(&self, sorts: &mut Vec>); + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap); + + fn write_result(&self, results: &mut VecDeque, + write: &mut Write) -> Result<(), Error>; +} + +struct RetObject { fields: Vec<(String, Box)>, } impl Returnable for RetObject { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, bind_var_keys: &HashMap, - result: &mut VecDeque>) -> Result<(), Error> { + result: &mut VecDeque) -> Result<(), Error> { for &(ref _key, ref field) in self.fields.iter() { try!(field.fetch_result(iter, seq, bind_var_keys, result)); } Ok(()) } - fn write_result(&self, results: &mut VecDeque>, + fn get_aggregate_funs(&self, funs: &mut Vec>) { + for &(ref _key, ref field) in self.fields.iter() { + field.get_aggregate_funs(funs); + } + } + + fn get_sorting(&self, sorts: &mut Vec>) { + for &(ref _key, ref field) in self.fields.iter() { + field.get_sorting(sorts); + } + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + for &mut (ref _key, ref mut field) in self.fields.iter_mut() { + field.take_sort_for_matching_fields(map); + } + } + + fn write_result(&self, results: &mut VecDeque, write: &mut Write) -> Result<(), Error> { try!(write.write_all("{".as_bytes())); let mut iter = self.fields.iter().peekable(); @@ -772,21 +1865,39 @@ impl Returnable for RetObject { } -pub struct RetArray { +struct RetArray { slots: Vec>, } impl Returnable for RetArray { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, bind_var_keys: &HashMap, - result: &mut VecDeque>) -> Result<(), Error> { - for ref mut slot in self.slots.iter() { + result: &mut VecDeque) -> Result<(), Error> { + for ref slot in self.slots.iter() { try!(slot.fetch_result(iter, seq, bind_var_keys, result)); } Ok(()) } - fn write_result(&self, results: &mut VecDeque>, + fn 
get_aggregate_funs(&self, funs: &mut Vec>) { + for ref slot in self.slots.iter() { + slot.get_aggregate_funs(funs); + } + } + + fn get_sorting(&self, sorts: &mut Vec>) { + for ref slot in self.slots.iter() { + slot.get_sorting(sorts); + } + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + for slot in self.slots.iter_mut() { + slot.take_sort_for_matching_fields(map); + } + } + + fn write_result(&self, results: &mut VecDeque, write: &mut Write) -> Result<(), Error> { try!(write.write_all("[".as_bytes())); @@ -805,8 +1916,95 @@ impl Returnable for RetArray { } } +#[derive(PartialEq, Eq, Clone)] +enum Sort { + Asc, + Desc, +} + +struct SortInfo { + kb: KeyBuilder, + sort: Sort, + default: JsonValue, +} + +struct RetHidden { + unrendered: Vec, + visible: Box, +} + +impl Returnable for RetHidden { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + bind_var_keys: &HashMap, + result: &mut VecDeque) -> Result<(), Error> { + for ref mut unrendered in self.unrendered.iter() { + try!(unrendered.fetch_result(iter, seq, bind_var_keys, result)); + } + + self.visible.fetch_result(iter, seq, bind_var_keys, result) + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + self.visible.get_aggregate_funs(funs); + } + + fn get_sorting(&self, sorts: &mut Vec>) { + for ref mut unrendered in self.unrendered.iter() { + unrendered.get_sorting(sorts); + } + + self.visible.get_sorting(sorts); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + self.visible.take_sort_for_matching_fields(map); + } + + fn write_result(&self, results: &mut VecDeque, + write: &mut Write) -> Result<(), Error> { + for _n in 0..self.unrendered.len() { + // we already sorted at this point, now discard the values + results.pop_front(); + } + self.visible.write_result(results, write) + } +} + +struct RetLiteral { + json: JsonValue, +} + +impl Returnable for RetLiteral { + fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, + _bind_var_keys: &HashMap, + _result: &mut VecDeque) -> Result<(), Error> { + Ok(()) + } + + fn get_aggregate_funs(&self, _funs: &mut Vec>) { + //noop + } + + fn get_sorting(&self, _sorts: &mut Vec>) { + //noop + } + + fn take_sort_for_matching_fields(&mut self, _map: &mut HashMap) { + //noop + } + + fn write_result(&self, _results: &mut VecDeque, + write: &mut Write) -> Result<(), Error> { + + self.json.render(write) + } +} + pub struct RetValue { kb: KeyBuilder, + ag: Option<(AggregateFun, JsonValue)>, + default: JsonValue, + sort: Option, } impl RetValue { @@ -947,7 +2145,12 @@ impl RetValue { impl Returnable for RetValue { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, bind_var_keys: &HashMap, - result: &mut VecDeque>) -> Result<(), Error> { + result: &mut VecDeque) -> Result<(), Error> { + if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { + //don't fetch anything for count(). 
just stick in a null + result.push_back(JsonValue::Null); + return Ok(()); + } let value_key = if self.kb.keypath_segments_len() == 1 { let key = self.kb.peek_object_key(); if let Some(value_key) = bind_var_keys.get(&key) { @@ -966,31 +2169,40 @@ impl Returnable for RetValue { let (key, value) = match iter.next() { Some((key, value)) => (key, value), None => { - result.push_back(None); + result.push_back(self.default.clone()); return Ok(()) }, }; if !key.starts_with(value_key.as_bytes()) { - result.push_back(None); + result.push_back(self.default.clone()); return Ok(()); } let json_value = try!(RetValue::fetch(&mut iter.peekable(), &value_key, key, value)); - result.push_back(Some(json_value)); + result.push_back(json_value); Ok(()) } - fn write_result(&self, results: &mut VecDeque>, + fn get_aggregate_funs(&self, funs: &mut Vec>) { + funs.push(self.ag.clone()); + } + + fn get_sorting(&self, sorts: &mut Vec>) { + sorts.push(self.sort.clone()); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + if let Some(sort_info) = map.remove(&self.kb.value_key(0)) { + self.sort = Some(sort_info.sort); + } + } + + fn write_result(&self, results: &mut VecDeque, write: &mut Write) -> Result<(), Error> { - if let Some(option) = results.pop_front() { - if let Some(json) = option { - try!(json.render(write)); - } else { - // for now just output a Null when we found nothing - try!(JsonValue::Null.render(write)); - } + if let Some(json) = results.pop_front() { + try!(json.render(write)); } else { panic!("missing result!"); } @@ -1000,31 +2212,16 @@ impl Returnable for RetValue { -impl Query { - pub fn get_matches<'a>(query: String, index: &'a Index) -> Result, Error> { - match index.rocks { - Some(ref rocks) => { - let snapshot = Snapshot::new(&rocks); - let mut parser = Parser::new(query, snapshot); - let filter = try!(parser.build_filter()); - let returnable = try!(parser.return_clause()); - Ok(QueryResults::new(filter, parser.snapshot, returnable)) - }, - None => { - Err(Error::Parse("You must open the index first".to_string())) - }, - } - } -} - #[cfg(test)] mod tests { + extern crate rustc_serialize; + use super::{Parser, Query}; use index::{Index, OpenOptions}; use rocksdb::Snapshot; - + #[test] fn test_whitespace() { let mut index = Index::new(); @@ -1194,13 +2391,332 @@ mod tests { return {foo:.A[0], bar: ._id} "#.to_string(), &index).unwrap(); assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"1","bar":"12"}"#.to_string())); assert_eq!(query_results.next_result().unwrap(), None); + query_results = Query::get_matches(r#"find {A:[ == "foo"]} return .A "#.to_string(), &index).unwrap(); assert_eq!(query_results.next_result().unwrap(),Some(r#"["foo",1,true,false,null,{},[]]"#.to_string())); assert_eq!(query_results.next_result().unwrap(), None); + + + query_results = Query::get_matches(r#"find {A:[ == "foo"]} + return .B "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + + query_results = Query::get_matches(r#"find {A:[ == "foo"]} + return .B default={foo:"foo"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"foo"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + + query_results = Query::get_matches(r#"find {A:[ == "foo"]} + return .B default={}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{}"#.to_string())); + 
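// (illustrative) default= supplies the rendered value whenever the requested
// keypath is missing from a matched document; it accepts any JSON value and
// also nests inside returned objects, as the surrounding assertions show.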
assert_eq!(query_results.next_result().unwrap(), None); + + + query_results = Query::get_matches(r#"find {A:[ == "foo"]} + return {foo: .B default={bar:"bar"}}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":{"bar":"bar"}}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[ == "foo"]} + return {"a":"a","b":1.123,"true":true,"false":false,"null":null,array:[],object:{}}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(), + Some(r#"{"a":"a","b":1.123,"true":true,"false":false,"null":null,"array":[],"object":{}}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + #[test] + fn test_query_group() { + let dbname = "target/tests/querytestgroup"; + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + + let _ = index.add(r#"{"_id":"1", "foo":"group", "baz": "a", "bar": 1}"#); + let _ = index.add(r#"{"_id":"2", "foo":"group", "baz": "b", "bar": 2}"#); + let _ = index.add(r#"{"_id":"3", "foo":"group", "baz": "c", "bar": 3}"#); + let _ = index.add(r#"{"_id":"4", "foo":"group", "baz": "a", "bar": 1}"#); + let _ = index.add(r#"{"_id":"5", "foo":"group", "baz": "b", "bar": 2}"#); + let _ = index.add(r#"{"_id":"6", "foo":"group", "baz": "c", "bar": 3}"#); + let _ = index.add(r#"{"_id":"7", "foo":"group", "baz": "a", "bar": 1}"#); + let _ = index.add(r#"{"_id":"8", "foo":"group", "baz": "b", "bar": 2}"#); + let _ = index.add(r#"{"_id":"9", "foo":"group", "baz": "c", "bar": 3}"#); + let _ = index.add(r#"{"_id":"10", "foo":"group", "baz": "a", "bar": 1}"#); + let _ = index.add(r#"{"_id":"11", "foo":"group", "baz": "b", "bar": 2}"#); + let _ = index.add(r#"{"_id":"12", "foo":"group", "baz": "c", "bar": 3}"#); + + index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {baz: group(.baz), bar: sum(.bar)}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","bar":4}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","bar":8}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","bar":12}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {bar: sum(.bar)}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":24}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {bar: avg(.bar)}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":2}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {baz: group(.baz), concat: concat(.baz sep="|")}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","concat":"a|a|a|a"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","concat":"b|b|b|b"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","concat":"c|c|c|c"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + + let mut query_results = Query::get_matches(r#"find {foo: 
=="group"} + return {baz: group(.baz), list: list(.baz)}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","list":["a","a","a","a"]}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","list":["b","b","b","b"]}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","list":["c","c","c","c"]}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {baz: group(.baz), count: count()}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","count":4}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","count":4}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","count":4}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {max: max(.bar)}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"max":3}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {min: min(.bar)}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"min":1}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + + let mut query_results = Query::get_matches(r#"find {foo: =="group"} + return {max: max(.baz)}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"max":"c"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + let _ = index.add(r#"{"_id":"1", "foo":"group2", "baz": "a", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"2", "foo":"group2", "baz": "a", "bar": "b"}"#); + let _ = index.add(r#"{"_id":"3", "foo":"group2", "baz": "b", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"4", "foo":"group2", "baz": "b", "bar": "b"}"#); + let _ = index.add(r#"{"_id":"5", "foo":"group2", "baz": "a", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"6", "foo":"group2", "baz": "a", "bar": "c"}"#); + let _ = index.add(r#"{"_id":"7", "foo":"group2", "baz": "b", "bar": "d"}"#); + let _ = index.add(r#"{"_id":"8", "foo":"group2", "baz": "b", "bar": "e"}"#); + let _ = index.add(r#"{"_id":"9", "foo":"group2", "baz": "a", "bar": "f"}"#); + + index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {foo: =="group2"} + return [group(.baz order=asc), group(.bar order=desc), count()]"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","b",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","a",2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","e",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","d",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","b",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","a",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find 
{foo: =="group2"} + return [group(.baz order=asc), group(.bar order=desc), count()] + limit 2"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + let _ = index.add(r#"{"_id":"1", "foo":"group3", "baz": "a", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"2", "foo":"group3", "bar": "b"}"#); + let _ = index.add(r#"{"_id":"3", "foo":"group3", "baz": "b", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"4", "foo":"group3", "baz": "b", "bar": "b"}"#); + let _ = index.add(r#"{"_id":"5", "foo":"group3", "baz": "a", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"6", "foo":"group3", "baz": "a", }"#); + let _ = index.add(r#"{"_id":"7", "foo":"group3", "baz": "b", "bar": "d"}"#); + let _ = index.add(r#"{"_id":"8", "foo":"group3", "baz": "b", "bar": "e"}"#); + let _ = index.add(r#"{"_id":"9", "foo":"group3", "baz": "a", "bar": "f"}"#); + + index.flush().unwrap(); + + + let mut query_results = Query::get_matches(r#"find {foo: =="group2"} + return [group(.baz order=asc) default="a", group(.bar order=desc) default="c", count()]"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","b",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","a",2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","e",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","d",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","b",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","a",1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + + } + + + #[test] + fn test_query_json_collation() { + let dbname = "target/tests/querytestjsoncollation"; + + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + + assert_eq!(Ok(()), index.add(r#"{"_id":"1", "foo":"coll", "bar": {}}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"2", "foo":"coll", "bar": {"foo":"bar"}}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"3", "foo":"coll", "bar": {"foo":"baz"}}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"4", "foo":"coll", "bar": {"foo":"baz","bar":"baz"}}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"5", "foo":"coll", "bar": {"foo":"baz","bar":"bar"}}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"6", "foo":"coll", "bar": 1}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"7", "foo":"coll", "bar": 1.00001}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"8", "foo":"coll", "bar": 2.00001}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"9", "foo":"coll", "bar": true}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"10", "foo":"coll", "bar": false}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"11", "foo":"coll", "bar": null}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"12", "foo":"coll", "bar": []}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"13", "foo":"coll", "bar": [true]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"14", "foo":"coll", "bar": [null]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"15", "foo":"coll", "bar": "string"}"#)); + assert_eq!(Ok(()), 
index.add(r#"{"_id":"16", "foo":"coll", "bar": "string2"}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"17", "foo":"coll", "bar": "string3"}"#)); + + index.flush().unwrap(); + + + { + let mut query_results = Query::get_matches(r#"find {foo: =="coll"} + sort .bar asc + return .bar "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"false"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"true"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"1"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"1.00001"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"2.00001"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""string""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""string2""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""string3""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[null]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[true]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":"bar","foo":"baz"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":"baz","foo":"baz"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"bar"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"baz"}"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {foo: =="coll"} + sort .bar asc + return .bar + limit 5"#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"false"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"true"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"1"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"1.00001"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {foo: =="coll"} + sort .bar asc + return .bar + limit 1"#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + assert_eq!(Ok(()), index.add(r#"{"_id":"20", "foo":"coll2", "bar":[1,1,1]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"21", "foo":"coll2", "bar":[1,1,2]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"22", "foo":"coll2", "bar":[1,2,2]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"23", "foo":"coll2", "bar":[2,2,2]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"24", "foo":"coll2", "bar":[2,1,1]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"25", "foo":"coll2", "bar":[2,1,2]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"26", "foo":"coll2", "bar":[2,3,2]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"27", "foo":"coll2", "bar":[3,4,3]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"28", "foo":"coll2", "bar":[5,4,3]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"29", "foo":"coll2", "bar":[5,5,5]}"#)); + + 
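// The multi-key sort exercised below orders by each array element
// independently (.bar[0] asc, then .bar[1] desc, then .bar[2] desc), i.e.
// cmp_results() compares slot by slot rather than using whole-array
// collation.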
index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {foo: =="coll2"} + sort .bar[0] asc, .bar[1] desc, .bar[2] desc + return [.bar[0], .bar[1], .bar[2]] "#.to_string(), &index).unwrap(); + + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,2,2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,1,2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,1,1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,3,2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,2,2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,1,2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,1,1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[3,4,3]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[5,5,5]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"[5,4,3]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + + let _ = index.add(r#"{"_id":"1", "foo":"group2", "baz": "a", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"2", "foo":"group2", "baz": "a", "bar": "b"}"#); + let _ = index.add(r#"{"_id":"3", "foo":"group2", "baz": "b", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"4", "foo":"group2", "baz": "b", "bar": "b"}"#); + let _ = index.add(r#"{"_id":"5", "foo":"group2", "baz": "a", "bar": "a"}"#); + let _ = index.add(r#"{"_id":"6", "foo":"group2", "baz": "a", "bar": "c"}"#); + let _ = index.add(r#"{"_id":"7", "foo":"group2", "baz": "b", "bar": "d"}"#); + let _ = index.add(r#"{"_id":"8", "foo":"group2", "baz": "b", "bar": "e"}"#); + let _ = index.add(r#"{"_id":"9", "foo":"group2", "baz": "a", "bar": "f"}"#); + + index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {foo: =="group2"} + sort .baz asc, .bar desc + return [.baz, .bar] + limit 2"#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f"]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c"]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + } + + #[test] fn test_query_more_docs() { let dbname = "target/tests/querytestdbmoredocs"; From 165fcf05e39ed4bb5e334730be838747f34abff6 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sat, 31 Dec 2016 09:56:25 -0800 Subject: [PATCH 051/122] Moved Parser and JsonValue out of query.rs and into separate files --- src/json_value.rs | 188 +++++++ src/lib.rs | 2 + src/parser.rs | 998 +++++++++++++++++++++++++++++++++++++ src/query.rs | 1196 +-------------------------------------------- 4 files changed, 1212 insertions(+), 1172 deletions(-) create mode 100644 src/json_value.rs create mode 100644 src/parser.rs diff --git a/src/json_value.rs b/src/json_value.rs new file mode 100644 index 0000000..e97c24a --- /dev/null +++ b/src/json_value.rs @@ -0,0 +1,188 @@ + +use std::str; +use std::cmp::Ordering; +use std::io::Write; + +use error::Error; + + + +#[derive(PartialEq, PartialOrd, Clone, Debug)] +pub enum JsonValue { + Number(f64), + String(String), + Array(Vec), + Object(Vec<(String, JsonValue)>), + True, + False, + Null, +} + +impl JsonValue { + pub fn str_to_literal(string: &str) ->String { + let mut ret = String::with_capacity(string.len()*2+2); + ret.push('"'); + for c in string.chars() { + if c == '"' || c == '\\' { + ret.push('\\'); + } + ret.push(c); + } + ret.push('"'); 
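// e.g. str_to_literal(r#"say "hi""#) == r#""say \"hi\"""#; only quotes and
// backslashes are escaped here, control characters pass through unchanged.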
+ ret + } + + fn cmp_always_equal(_a: &JsonValue, _b: &JsonValue) -> Ordering { + Ordering::Equal + } + + fn cmp_f64(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Number(a_val) = a { + if let &JsonValue::Number(b_val) = b { + if a_val < b_val { + Ordering::Less + } else if a_val > b_val { + Ordering::Greater + } else { + Ordering::Equal + } + } else { + panic!("cast error in cmp_f64"); + } + } else { + panic!("cast error in cmp_f64"); + } + } + + fn cmp_string(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::String(ref a_val) = a { + if let &JsonValue::String(ref b_val) = b { + // Note we eventually want to switch to a collation library like ICU + a_val.cmp(&b_val) + } else { + panic!("cast error in cmp_string"); + } + } else { + panic!("cast error in cmp_string"); + } + } + + fn cmp_array(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Array(ref a_val) = a { + if let &JsonValue::Array(ref b_val) = b { + for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { + let order = a_el.cmp(&b_el); + if order != Ordering::Equal { + return order; + } + } + // if we got here all elements were equal. But one array might be longer + // so sort it last + a_val.len().cmp(&b_val.len()) + } else { + panic!("cast error in cmp_array"); + } + } else { + panic!("cast error in cmp_array"); + } + } + + fn cmp_object(a: &JsonValue, b: &JsonValue) -> Ordering { + if let &JsonValue::Object(ref a_val) = a { + if let &JsonValue::Object(ref b_val) = b { + for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { + // compare key + let mut order = a_el.0.cmp(&b_el.0); + if order != Ordering::Equal { + return order; + } + // compare value + order = a_el.1.cmp(&b_el.1); + if order != Ordering::Equal { + return order; + } + } + // if we got here all elements were equal. 
But one object might be longer + // so sort it last + a_val.len().cmp(&b_val.len()) + } else { + panic!("cast error in cmp_object"); + } + } else { + panic!("cast error in cmp_object"); + } + } + + fn type_sort_order(&self) -> (usize, fn(&JsonValue, &JsonValue) -> Ordering) { + match self { + &JsonValue::Null => (0, JsonValue::cmp_always_equal), + &JsonValue::False => (1, JsonValue::cmp_always_equal), + &JsonValue::True => (2, JsonValue::cmp_always_equal), + &JsonValue::Number(_) => (3, JsonValue::cmp_f64), + &JsonValue::String(_) => (4, JsonValue::cmp_string), + &JsonValue::Array(_) => (5, JsonValue::cmp_array), + &JsonValue::Object(_) => (6, JsonValue::cmp_object), + } + } + + pub fn render(&self, write: &mut Write) -> Result<(), Error> { + match self { + &JsonValue::Number(ref num) => try!(write.write_all(num.to_string().as_bytes())), + &JsonValue::String(ref string) => { + try!(write.write_all(JsonValue::str_to_literal(&string).as_bytes())) + }, + &JsonValue::Array(ref array) => { + try!(write.write_all("[".as_bytes())); + + let mut iter = array.iter().peekable(); + loop { + match iter.next() { + Some(json) => try!(json.render(write)), + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + } + try!(write.write_all("]".as_bytes())); + }, + &JsonValue::Object(ref object) => { + try!(write.write_all("{".as_bytes())); + + let mut iter = object.iter().peekable(); + loop { + match iter.next() { + Some(&(ref key, ref json)) => { + try!(write.write_all(JsonValue::str_to_literal(&key).as_bytes())); + try!(write.write_all(":".as_bytes())); + try!(json.render(write)); + } + None => break, + } + if iter.peek().is_some() { + try!(write.write_all(",".as_bytes())); + } + } + try!(write.write_all("}".as_bytes())); + }, + &JsonValue::True => try!(write.write_all("true".as_bytes())), + &JsonValue::False => try!(write.write_all("false".as_bytes())), + &JsonValue::Null => try!(write.write_all("null".as_bytes())), + } + Ok(()) + } +} + +impl Eq for JsonValue {} + +impl Ord for JsonValue { + fn cmp(&self, other: &JsonValue) -> Ordering { + let (self_order_num, self_cmp_fun) = self.type_sort_order(); + let (other_order_num, _other_cmp_fun) = other.type_sort_order(); + match self_order_num.cmp(&other_order_num) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => self_cmp_fun(self, other), + } + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 80dd5d7..85830c4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,9 @@ extern crate rocksdb; mod error; mod filters; mod json_shred; +mod json_value; mod key_builder; +mod parser; mod stems; pub mod index; pub mod query; diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..ab8a829 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,998 @@ + +use std::str; +use std::collections::HashMap; +use std::iter::Iterator; +use std::usize; + +use error::Error; +use key_builder::KeyBuilder; +use stems::Stems; +use json_value::JsonValue; +use query::{Sort, Returnable, RetValue, RetObject, RetArray, RetLiteral, AggregateFun, SortInfo}; +use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, + StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter}; + + +// TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs +use rocksdb::{IteratorMode, Snapshot}; + + +pub struct Parser<'a> { + query: String, + offset: usize, + kb: KeyBuilder, + pub snapshot: Snapshot<'a>, 
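+    // Note: `offset` is a byte index into `query`; every consume_* helper
+    // below advances it past whatever it matched, plus any trailing whitespace.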
+} + +impl<'a> Parser<'a> { + pub fn new(query: String, snapshot: Snapshot<'a>) -> Parser<'a> { + Parser { + query: query, + offset: 0, + kb: KeyBuilder::new(), + snapshot: snapshot, + } + } + + fn ws(&mut self) { + for char in self.query[self.offset..].chars() { + if !char.is_whitespace() { + break; + } + self.offset += char.len_utf8(); + } + } + + fn consume(&mut self, token: &str) -> bool { + if self.could_consume(token) { + self.offset += token.len(); + self.ws(); + true + } else { + false + } + } + + + fn must_consume(&mut self, token: &str) -> Result<(), Error> { + if self.could_consume(token) { + self.offset += token.len(); + self.ws(); + Ok(()) + } else { + Err(Error::Parse(format!("Expected '{}' at character {}, found {}.", + token, self.offset, + &self.query[self.offset..self.offset+1]))) + } + } + + fn could_consume(&self, token: &str) -> bool { + self.query[self.offset..].starts_with(token) + } + + fn consume_key(&mut self) -> Result, Error> { + if let Some(key) = self.consume_field() { + Ok(Some(key)) + } else if let Some(key) = try!(self.consume_string_literal()) { + Ok(Some(key)) + } else { + Ok(None) + } + } + + fn consume_field(&mut self) -> Option { + let mut result = String::new(); + { + let mut chars = self.query[self.offset..].chars(); + if let Some(c) = chars.next() { + // first char cannot be numeric + if c.is_alphabetic() || '_' == c || '$' == c { + result.push(c); + for c in chars { + if c.is_alphanumeric() || '_' == c || '$' == c { + result.push(c); + } else { + break; + } + } + } + } + } + if result.len() > 0 { + self.offset += result.len(); + self.ws(); + Some(result) + } else { + None + } + } + + fn consume_integer(&mut self) -> Result, Error> { + let mut result = String::new(); + for char in self.query[self.offset..].chars() { + if char >= '0' && char <= '9' { + result.push(char); + } else { + break; + } + } + if !result.is_empty() { + self.offset += result.len(); + self.ws(); + Ok(Some(try!(result.parse()))) + } else { + Ok(None) + } + } + + fn consume_default(&mut self) -> Result, Error> { + if self.consume("default") { + try!(self.must_consume("=")); + if let Some(json) = try!(self.json()) { + Ok(Some(json)) + } else { + Err(Error::Parse("Expected json value for default".to_string())) + } + } else { + Ok(None) + } + } + + fn consume_aggregate(&mut self) -> Result, Error> { + let offset = self.offset; + let mut aggregate_fun = if self.consume("group") { + AggregateFun::GroupAsc + } else if self.consume("sum") { + AggregateFun::Sum + } else if self.consume("max") { + AggregateFun::Max + } else if self.consume("min") { + AggregateFun::Min + } else if self.consume("list") { + AggregateFun::List + } else if self.consume("concat") { + AggregateFun::Concat + } else if self.consume("avg") { + AggregateFun::Avg + } else if self.consume("count") { + AggregateFun::Count + } else { + return Ok(None) + }; + + if self.consume("(") { + if aggregate_fun == AggregateFun::Count { + try!(self.must_consume(")")); + Ok(Some((aggregate_fun, KeyBuilder::new(), JsonValue::Null))) + } else if aggregate_fun == AggregateFun::Concat { + if let Some(kb) = try!(self.consume_keypath()) { + let json = if self.consume("sep") { + try!(self.must_consume("=")); + JsonValue::String(try!(self.must_consume_string_literal())) + } else { + JsonValue::String(",".to_string()) + }; + try!(self.must_consume(")")); + Ok(Some((aggregate_fun, kb, json))) + } else { + Err(Error::Parse("Expected keypath or bind variable".to_string())) + } + } else if let Some(kb) = try!(self.consume_keypath()) { + if 
self.consume("order") {
+                    try!(self.must_consume("="));
+                    if self.consume("asc") {
+                        aggregate_fun = AggregateFun::GroupAsc;
+                    } else if self.consume("desc") {
+                        aggregate_fun = AggregateFun::GroupDesc;
+                    } else {
+                        return Err(Error::Parse("Expected asc or desc".to_string()));
+                    }
+                }
+                try!(self.must_consume(")"));
+
+                Ok(Some((aggregate_fun, kb, JsonValue::Null)))
+            } else {
+                Err(Error::Parse("Expected keypath or bind variable".to_string()))
+            }
+        } else {
+            // this consumed word above might be a Bind var. Unconsume and return nothing.
+            self.offset = offset;
+            Ok(None)
+        }
+    }
+
+    fn consume_keypath(&mut self) -> Result<Option<KeyBuilder>, Error> {
+        let key: String = if self.consume(".") {
+            if self.consume("[") {
+                let key = try!(self.must_consume_string_literal());
+                try!(self.must_consume("]"));
+                key
+            } else {
+                if let Some(key) = self.consume_field() {
+                    key
+                } else {
+                    self.ws();
+                    // this means return the whole document
+                    return Ok(Some(KeyBuilder::new()));
+                }
+            }
+        } else {
+            return Ok(None);
+        };
+
+        let mut kb = KeyBuilder::new();
+        kb.push_object_key(&key);
+        loop {
+            if self.consume("[") {
+                if let Some(index) = try!(self.consume_integer()) {
+                    kb.push_array_index(index as u64);
+                } else {
+                    return Err(Error::Parse("Expected array index integer.".to_string()));
+                }
+                try!(self.must_consume("]"));
+            } else if self.consume(".") {
+                if let Some(key) = self.consume_field() {
+                    kb.push_object_key(&key);
+                } else {
+                    return Err(Error::Parse("Expected object key.".to_string()));
+                }
+            } else {
+                break;
+            }
+        }
+        self.ws();
+        Ok(Some(kb))
+    }
+
+    fn consume_number(&mut self) -> Result<Option<f64>, Error> {
+        // Yes this parsing code is hideously verbose. But it conforms exactly to the json spec
+        // and uses the rust f64 parser, which can't tell us how many characters it used or needs.
+
+        // At the end it then uses the std rust String::parse() method to parse and return
+        // the f64 value and advance the self.offset. The rust method is a superset of the
+        // allowable json syntax, so it will parse any valid json floating point number. It might
+        // return an error if the number is out of bounds.
+        let mut result = String::new();
+        'outer: loop {
+            // this loop isn't a loop, it's just there to scope the self borrow
+            // and then jump to the end to do another borrow (self.ws())
+            let mut chars = self.query[self.offset..].chars();
+            let mut c = if let Some(c) = chars.next() {
+                c
+            } else {
+                return Ok(None);
+            };
+
+            // parse the sign
+            c = if c == '-' {
+                result.push('-');
+                if let Some(c) = chars.next() { c } else { return Ok(None); }
+            } else {
+                c
+            };
+
+            // parse the first digit
+            let mut leading_zero = false;
+            c = if c == '0' {
+                result.push('0');
+                leading_zero = true;
+                if let Some(c) = chars.next() { c } else { return Ok(None); }
+            } else if c >= '1' && c <= '9' {
+                result.push(c);
+                if let Some(c) = chars.next() { c } else { return Ok(None); }
+            } else if result.is_empty() {
+                // no sign or digits found. not a number
+                return Ok(None);
+            } else {
+                return Err(Error::Parse("Expected digits after sign (-).".to_string()));
+            };
+
+            // parse remaining significant digits
+            if !leading_zero {
+                // no more digits allowed if first digit is zero
+                loop {
+                    c = if c >= '0' && c <= '9' {
+                        result.push(c);
+                        if let Some(c) = chars.next() {
+                            c
+                        } else {
+                            break 'outer;
+                        }
+                    } else {
+                        break;
+                    };
+                }
+            }
+
+            // parse decimal
+            c = if c == '.' {
+                result.push(c);
+                if let Some(c) = chars.next() {
+                    c
+                } else {
+                    return Err(Error::Parse("Expected digits after decimal point.".to_string()));
+                }
+            } else {
+                break 'outer;
+            };
+
+            // parse mantissa
+            let mut found_mantissa = false;
+            loop {
+                c = if c >= '0' && c <= '9' {
+                    result.push(c);
+                    found_mantissa = true;
+
+                    if let Some(c) = chars.next() {
+                        c
+                    } else {
+                        break 'outer;
+                    }
+                } else {
+                    if found_mantissa {
+                        break;
+                    }
+                    return Err(Error::Parse("Expected digits after decimal point.".to_string()));
+                };
+            }
+
+            // parse exponent symbol
+            c = if c == 'e' || c == 'E' {
+                result.push(c);
+                if let Some(c) = chars.next() {
+                    c
+                } else {
+                    return Err(Error::Parse("Expected exponent after e.".to_string()));
+                }
+            } else {
+                break 'outer;
+            };
+
+            // parse exponent sign
+            c = if c == '+' || c == '-' {
+                result.push(c);
+                if let Some(c) = chars.next() {
+                    c
+                } else {
+                    return Err(Error::Parse("Expected exponent after e.".to_string()));
+                }
+            } else {
+                c
+            };
+
+            // parse exponent digits
+            let mut found_exponent = false;
+            loop {
+                c = if c >= '0' && c <= '9' {
+                    result.push(c);
+                    found_exponent = true;
+                    if let Some(c) = chars.next() {
+                        c
+                    } else {
+                        break 'outer;
+                    }
+                } else {
+                    if found_exponent {
+                        break 'outer;
+                    }
+                    return Err(Error::Parse("Expected exponent after e.".to_string()));
+                }
+            }
+        }
+
+        self.offset += result.len();
+        self.ws();
+        Ok(Some(try!(result.parse())))
+    }
+
+
+    fn must_consume_string_literal(&mut self) -> Result<String, Error> {
+        if let Some(string) = try!(self.consume_string_literal()) {
+            Ok(string)
+        } else {
+            Err(Error::Parse("Expected string literal.".to_string()))
+        }
+    }
+
+    fn consume_string_literal(&mut self) -> Result<Option<String>, Error> {
+        let mut lit = String::new();
+        if !self.could_consume("\"") {
+            return Ok(None);
+        }
+        // can't consume("\"") the leading quote because it will also skip leading whitespace
+        // inside the string literal
+        self.offset += 1;
+        {
+            let mut chars = self.query[self.offset..].chars();
+            'outer: loop {
+                let char = if let Some(char) = chars.next() {
+                    char
+                } else {
+                    break;
+                };
+                if char == '\\' {
+                    self.offset += 1;
+
+                    let char = if let Some(char) = chars.next() {
+                        char
+                    } else {
+                        break;
+                    };
+                    match char {
+                        '\\' | '"' | '/' => lit.push(char),
+                        'n' => lit.push('\n'),
+                        'b' => lit.push('\x08'),
+                        'r' => lit.push('\r'),
+                        'f' => lit.push('\x0C'),
+                        't' => lit.push('\t'),
+                        'v' => lit.push('\x0B'),
+                        'u' => {
+                            let mut n = 0;
+                            for _i in 0..4 {
+                                let char = if let Some(char) = chars.next() {
+                                    char
+                                } else {
+                                    break 'outer;
+                                };
+                                n = match char {
+                                    c @ '0' ... '9' => n * 16 + ((c as u16) - ('0' as u16)),
+                                    c @ 'a' ... 'f' => n * 16 + (10 + (c as u16) - ('a' as u16)),
+                                    c @ 'A' ... 'F' => n * 16 + (10 + (c as u16) - ('A' as u16)),
+                                    _ => return Err(Error::Parse(format!(
+                                        "Invalid hexadecimal escape: {}", char))),
+                                };
+                            }
+                            // bug fix: the decoded code point was computed into `n`
+                            // but never pushed onto the literal
+                            match String::from_utf16(&[n]) {
+                                Ok(s) => lit.push_str(&s),
+                                Err(_) => return Err(Error::Parse(format!(
+                                    "Invalid unicode escape: {}", n))),
+                            }
+                            self.offset += 3; // 3 because 1 is always added after the match below
+                        },
+                        _ => return Err(Error::Parse(format!("Unknown character escape: {}",
+                                                             char))),
+                    };
+                    self.offset += 1;
+                } else {
+                    if char == '"' {
+                        break;
+                    } else {
+                        lit.push(char);
+                        self.offset += char.len_utf8();
+                    }
+                }
+            }
+        }
+        try!(self.must_consume("\""));
+        Ok(Some(lit))
+    }
+
+/*
+
+find
+    = "find" ws object ws
+
+object
+    = "{" ws obool ws "}" ws (("&&" / "||") ws object)?
+    / parens
+
+parens
+    = "(" ws object ws ")"
+
+obool
+    = ws ocompare ws (('&&' / ',' / '||') ws obool)?
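+
+(worked example, not itself a production: the query
+    find {foo: =="bar", baz: ~2="quux wow"}
+ parses as find -> object -> obool, each comparison being an ocompare,
+ with the ',' between them treated exactly like '&&')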
+ +ocompare + = oparens + / key ws ":" ws (oparens / compare) + +oparens + = '(' ws obool ws ')' ws + / array + / object + +compare + = ("==" / "~=" / "~" digits "=" ) ws string ws + +abool + = ws acompare ws (('&&'/ ',' / '||') ws abool)? + +acompare + = aparens + / compare + +aparens + = '(' ws abool ')' ws + / array + / object + +array + = '[' ws abool ']' ws + +key + = field / string + +field + = [a-z_$]i [a-z_$0-9]i* + +string + = '"' ('\\\\' / '\\' [\"tfvrnb] / [^\\\"])* '"' ws + +digits + = [0-9]+ + +ws + = [ \t\n\r]* + +ws1 + = [ \t\n\r]+ +*/ + + + fn find<'b>(&'b mut self) -> Result, Error> { + if !self.consume("find") { + return Err(Error::Parse("Missing 'find' keyword".to_string())); + } + self.object() + } + + fn object<'b>(&'b mut self) -> Result, Error> { + if self.consume("{") { + let left = try!(self.obool()); + try!(self.must_consume("}")); + + if self.consume("&&") { + let right = try!(self.object()); + Ok(Box::new(AndFilter::new(vec![left, right], self.kb.arraypath_len()))) + + } else if self.consume("||") { + let right = try!(self.object()); + Ok(Box::new(OrFilter::new(left, right, self.kb.arraypath_len()))) + } else { + Ok(left) + } + } else { + self.parens() + } + } + + fn parens<'b>(&'b mut self) -> Result, Error> { + try!(self.must_consume("(")); + let filter = try!(self.object()); + try!(self.must_consume(")")); + Ok(filter) + } + + fn obool<'b>(&'b mut self) -> Result, Error> { + let mut filter = try!(self.ocompare()); + loop { + filter = if self.consume("&&") || self.consume(",") { + let right = try!(self.obool()); + Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len())) + } else if self.consume("||") { + let right = try!(self.obool()); + Box::new(OrFilter::new(filter, right, self.kb.arraypath_len())) + } else { + break; + } + } + Ok(filter) + } + + fn ocompare<'b>(&'b mut self) -> Result, Error> { + if let Some(filter) = try!(self.oparens()) { + Ok(filter) + } else if let Some(field) = try!(self.consume_key()) { + self.kb.push_object_key(&field); + try!(self.must_consume(":")); + if let Some(filter) = try!(self.oparens()) { + self.kb.pop_object_key(); + Ok(filter) + } else { + let filter = try!(self.compare()); + self.kb.pop_object_key(); + Ok(filter) + } + } else { + Err(Error::Parse("Expected object key or '('".to_string())) + } + } + + fn oparens<'b>(&'b mut self) -> Result>, Error> { + if self.consume("(") { + let f = try!(self.obool()); + try!(self.must_consume(")")); + Ok(Some(f)) + } else if self.could_consume("[") { + Ok(Some(try!(self.array()))) + } else if self.could_consume("{") { + Ok(Some(try!(self.object()))) + } else { + Ok(None) + } + } + + fn compare<'b>(&'b mut self) -> Result, Error> { + if self.consume("==") { + let literal = try!(self.must_consume_string_literal()); + let stems = Stems::new(&literal); + let mut filters: Vec> = Vec::new(); + for stem in stems { + let iter = self.snapshot.iterator(IteratorMode::Start); + let filter = Box::new(ExactMatchFilter::new( + iter, &stem, &self.kb)); + filters.push(filter); + } + match filters.len() { + 0 => panic!("Cannot create a ExactMatchFilter"), + 1 => Ok(filters.pop().unwrap()), + _ => Ok(Box::new(AndFilter::new(filters, self.kb.arraypath_len()))), + } + } else if self.consume("~=") { + // regular search + let literal = try!(self.must_consume_string_literal()); + let stems = Stems::new(&literal); + let stemmed_words: Vec = stems.map(|stem| stem.stemmed).collect(); + + match stemmed_words.len() { + 0 => panic!("Cannot create a StemmedWordFilter"), + 1 => { + let iter = 
self.snapshot.iterator(IteratorMode::Start);
+                    Ok(Box::new(StemmedWordFilter::new(iter, &stemmed_words[0], &self.kb)))
+                },
+                _ => {
+                    let mut filters: Vec<StemmedWordPosFilter> = Vec::new();
+                    for stemmed_word in stemmed_words {
+                        let iter = self.snapshot.iterator(IteratorMode::Start);
+                        let filter = StemmedWordPosFilter::new(iter, &stemmed_word, &self.kb);
+                        filters.push(filter);
+                    }
+                    Ok(Box::new(StemmedPhraseFilter::new(filters)))
+                },
+            }
+        } else if self.consume("~") {
+            let word_distance = match try!(self.consume_integer()) {
+                Some(int) => int,
+                None => {
+                    return Err(Error::Parse("Expected integer for proximity search".to_string()));
+                },
+            };
+            try!(self.must_consume("="));
+
+            let literal = try!(self.must_consume_string_literal());
+            let stems = Stems::new(&literal);
+            let mut filters: Vec<StemmedWordPosFilter> = Vec::new();
+            for stem in stems {
+                let iter = self.snapshot.iterator(IteratorMode::Start);
+                let filter = StemmedWordPosFilter::new(
+                    iter, &stem.stemmed, &self.kb);
+                filters.push(filter);
+            }
+            match filters.len() {
+                0 => panic!("Cannot create a DistanceFilter"),
+                _ => Ok(Box::new(DistanceFilter::new(filters, word_distance))),
+            }
+        } else {
+            Err(Error::Parse("Expected comparison operator".to_string()))
+        }
+    }
+
+    fn abool<'b>(&'b mut self) -> Result<Box<QueryRuntimeFilter + 'a>, Error> {
+        let mut filter = try!(self.acompare());
+        loop {
+            filter = if self.consume("&&") || self.consume(",") {
+                let right = try!(self.abool());
+                Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len()))
+            } else if self.consume("||") {
+                let right = try!(self.abool());
+                Box::new(OrFilter::new(filter, right, self.kb.arraypath_len()))
+            } else {
+                break;
+            }
+        }
+        Ok(filter)
+    }
+
+    fn acompare<'b>(&'b mut self) -> Result<Box<QueryRuntimeFilter + 'a>, Error> {
+        if let Some(filter) = try!(self.aparens()) {
+            Ok(filter)
+        } else {
+            self.compare()
+        }
+    }
+
+    fn aparens<'b>(&'b mut self) -> Result<Option<Box<QueryRuntimeFilter + 'a>>, Error> {
+        if self.consume("(") {
+            let f = try!(self.abool());
+            try!(self.must_consume(")"));
+            Ok(Some(f))
+        } else if self.could_consume("[") {
+            Ok(Some(try!(self.array())))
+        } else if self.could_consume("{") {
+            Ok(Some(try!(self.object())))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn array<'b>(&'b mut self) -> Result<Box<QueryRuntimeFilter + 'a>, Error> {
+        if !self.consume("[") {
+            return Err(Error::Parse("Expected '['".to_string()));
+        }
+        self.kb.push_array();
+        let filter = try!(self.abool());
+        self.kb.pop_array();
+        try!(self.must_consume("]"));
+        Ok(filter)
+    }
+
+    pub fn sort_clause(&mut self) -> Result<HashMap<String, SortInfo>, Error> {
+        let mut sort_infos = HashMap::new();
+        if self.consume("sort") {
+            loop {
+                if let Some(kb) = try!(self.consume_keypath()) {
+                    // asc/desc is parsed twice (before and after default) so the user
+                    // can write the clauses in any order. Yes it's a hack, but it's simple.
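+                    // e.g. these two clauses parse identically:
+                    //   sort .foo desc default=0
+                    //   sort .foo default=0 desc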
+ let mut sort = if self.consume("asc") { + Sort::Asc + } else if self.consume("desc") { + Sort::Desc + } else { + Sort::Asc + }; + + let default = if self.consume("default") { + try!(self.must_consume("=")); + if let Some(json) = try!(self.json()) { + json + } else { + return Err(Error::Parse("Expected Json after default.".to_string())); + } + } else { + JsonValue::Null + }; + + sort = if self.consume("asc") { + Sort::Asc + } else if self.consume("desc") { + Sort::Desc + } else { + sort + }; + + sort_infos.insert(kb.value_key(0), SortInfo{kb:kb, + sort:sort, + default:default}); + if !self.consume(",") { + break; + } + } + } + if sort_infos.is_empty() { + return Err(Error::Parse("Expected field path in sort expression.".to_string())); + } + } + Ok(sort_infos) + } + + pub fn return_clause(&mut self) -> Result, Error> { + if self.consume("return") { + if let Some(ret_value) = try!(self.ret_value()) { + Ok(ret_value) + } else { + Err(Error::Parse("Expected key, object or array to return.".to_string())) + } + } else { + let mut kb = KeyBuilder::new(); + kb.push_object_key("_id"); + Ok(Box::new(RetValue{kb: kb, ag:None, default: JsonValue::Null, sort: None})) + } + } + + fn ret_object(&mut self) -> Result, Error> { + try!(self.must_consume("{")); + let mut fields: Vec<(String, Box)> = Vec::new(); + loop { + if let Some(field) = try!(self.consume_key()) { + try!(self.must_consume(":")); + if let Some(ret_value) = try!(self.ret_value()) { + fields.push((field, ret_value)); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Expected key to return.".to_string())); + } + } else { + break; + } + } + + try!(self.must_consume("}")); + Ok(Box::new(RetObject{fields: fields})) + } + + fn ret_array(&mut self) -> Result, Error> { + try!(self.must_consume("[")); + let mut slots = Vec::new(); + loop { + if let Some(ret_value) = try!(self.ret_value()) { + slots.push(ret_value); + if !self.consume(",") { + break; + } + } else { + break; + } + } + try!(self.must_consume("]")); + Ok(Box::new(RetArray{slots: slots})) + + } + + fn ret_value(&mut self) -> Result>, Error> { + if let Some((ag, kb, json)) = try!(self.consume_aggregate()) { + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + Ok(Some(Box::new(RetValue{kb: kb, ag: Some((ag, json)), + default: default, sort:None}))) + } + else if let Some(kb) = try!(self.consume_keypath()) { + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + Ok(Some(Box::new(RetValue{kb: kb, ag: None, default: default, sort: None}))) + } else if self.could_consume("{") { + Ok(Some(try!(self.ret_object()))) + } else if self.could_consume("[") { + Ok(Some(try!(self.ret_array()))) + } else if let Some(string) = try!(self.consume_string_literal()) { + Ok(Some(Box::new(RetLiteral{json: JsonValue::String(string)}))) + } else if let Some(num) = try!(self.consume_number()) { + Ok(Some(Box::new(RetLiteral{json: JsonValue::Number(num)}))) + } else { + if self.consume("true") { + Ok(Some(Box::new(RetLiteral{json: JsonValue::True}))) + } else if self.consume("false") { + Ok(Some(Box::new(RetLiteral{json: JsonValue::False}))) + } else if self.consume("null") { + Ok(Some(Box::new(RetLiteral{json: JsonValue::Null}))) + } else { + Ok(None) + } + } + } + + pub fn limit_clause(&mut self) -> Result { + if self.consume("limit") { + if let Some(i) = try!(self.consume_integer()) { + if i <= 0 { + return Err(Error::Parse("limit must be an integer greater than 
0" + .to_string())); + } + Ok(i as usize) + } else { + return Err(Error::Parse("limit expects an integer greater than 0" + .to_string())); + } + } else { + Ok(usize::MAX) + } + } + + fn json(&mut self) -> Result, Error> { + if self.could_consume("{") { + Ok(Some(try!(self.json_object()))) + } else if self.could_consume("[") { + Ok(Some(try!(self.json_array()))) + } else if let Some(string) = try!(self.consume_string_literal()) { + Ok(Some(JsonValue::String(string))) + } else { + if self.consume("true") { + Ok(Some(JsonValue::True)) + } else if self.consume("false") { + Ok(Some(JsonValue::False)) + } else if self.consume("null") { + Ok(Some(JsonValue::Null)) + } else if let Some(num) = try!(self.consume_number()) { + Ok(Some(JsonValue::Number(num))) + } else { + Ok(None) + } + } + } + + fn json_object(&mut self) -> Result { + try!(self.must_consume("{")); + let mut object = Vec::new(); + if self.consume("}") { + return Ok(JsonValue::Object(object)); + } + loop { + if let Some(field) = try!(self.consume_key()) { + try!(self.must_consume(":")); + if let Some(json) = try!(self.json()) { + object.push((field, json)); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } + try!(self.must_consume("}")); + Ok(JsonValue::Object(object)) + } + + fn json_array(&mut self) -> Result { + try!(self.must_consume("[")); + let mut array = Vec::new(); + if self.consume("]") { + return Ok(JsonValue::Array(array)); + } + loop { + if let Some(json) = try!(self.json()) { + array.push(json); + if !self.consume(",") { + break; + } + } else { + return Err(Error::Parse("Invalid json found".to_string())); + } + } + try!(self.must_consume("]")); + Ok(JsonValue::Array(array)) + } + + pub fn build_filter(&mut self) -> Result, Error> { + self.ws(); + Ok(try!(self.find())) + } +} + +#[cfg(test)] +mod tests { + + use super::Parser; + + use index::{Index, OpenOptions}; + + use rocksdb::Snapshot; + + #[test] + fn test_whitespace() { + let mut index = Index::new(); + index.open("target/tests/test_whitespace", Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + let mut snapshot = Snapshot::new(rocks); + + let mut query = " \n \t test".to_string(); + let mut parser = Parser::new(query, snapshot); + parser.ws(); + assert_eq!(parser.offset, 5); + + snapshot = Snapshot::new(rocks); + query = "test".to_string(); + parser = Parser::new(query, snapshot); + parser.ws(); + assert_eq!(parser.offset, 0); + } + + #[test] + fn test_must_consume_string_literal() { + let mut index = Index::new(); + index.open("target/tests/test_must_consume_string_literal", Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + let snapshot = Snapshot::new(rocks); + + let query = r#"" \n \t test""#.to_string(); + let mut parser = Parser::new(query, snapshot); + assert_eq!(parser.must_consume_string_literal().unwrap(), " \n \t test".to_string()); + } +} \ No newline at end of file diff --git a/src/query.rs b/src/query.rs index ba3e22f..d4dffed 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,6 +1,4 @@ -extern crate capnp; - use std::str; use std::cmp::Ordering; use std::io::Write; @@ -14,9 +12,9 @@ use std::usize; use error::Error; use index::Index; use key_builder::{KeyBuilder, Segment}; -use stems::Stems; -use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, - StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter}; +use 
parser::Parser; +use json_value::JsonValue; +use filters::QueryRuntimeFilter; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs @@ -485,944 +483,6 @@ impl<'a> QueryResults<'a> { } } -struct Parser<'a> { - query: String, - offset: usize, - kb: KeyBuilder, - snapshot: Snapshot<'a>, -} - -impl<'a> Parser<'a> { - fn new(query: String, snapshot: Snapshot<'a>) -> Parser<'a> { - Parser { - query: query, - offset: 0, - kb: KeyBuilder::new(), - snapshot: snapshot, - } - } - - fn ws(&mut self) { - for char in self.query[self.offset..].chars() { - if !char.is_whitespace() { - break; - } - self.offset += char.len_utf8(); - } - } - - fn consume(&mut self, token: &str) -> bool { - if self.could_consume(token) { - self.offset += token.len(); - self.ws(); - true - } else { - false - } - } - - - fn must_consume(&mut self, token: &str) -> Result<(), Error> { - if self.could_consume(token) { - self.offset += token.len(); - self.ws(); - Ok(()) - } else { - Err(Error::Parse(format!("Expected '{}' at character {}, found {}.", - token, self.offset, - &self.query[self.offset..self.offset+1]))) - } - } - - fn could_consume(&self, token: &str) -> bool { - self.query[self.offset..].starts_with(token) - } - - fn consume_key(&mut self) -> Result, Error> { - if let Some(key) = self.consume_field() { - Ok(Some(key)) - } else if let Some(key) = try!(self.consume_string_literal()) { - Ok(Some(key)) - } else { - Ok(None) - } - } - - fn consume_field(&mut self) -> Option { - let mut result = String::new(); - { - let mut chars = self.query[self.offset..].chars(); - if let Some(c) = chars.next() { - // first char cannot be numeric - if c.is_alphabetic() || '_' == c || '$' == c { - result.push(c); - for c in chars { - if c.is_alphanumeric() || '_' == c || '$' == c { - result.push(c); - } else { - break; - } - } - } - } - } - if result.len() > 0 { - self.offset += result.len(); - self.ws(); - Some(result) - } else { - None - } - } - - fn consume_integer(&mut self) -> Result, Error> { - let mut result = String::new(); - for char in self.query[self.offset..].chars() { - if char >= '0' && char <= '9' { - result.push(char); - } else { - break; - } - } - if !result.is_empty() { - self.offset += result.len(); - self.ws(); - Ok(Some(try!(result.parse()))) - } else { - Ok(None) - } - } - - fn consume_default(&mut self) -> Result, Error> { - if self.consume("default") { - try!(self.must_consume("=")); - if let Some(json) = try!(self.json()) { - Ok(Some(json)) - } else { - Err(Error::Parse("Expected json value for default".to_string())) - } - } else { - Ok(None) - } - } - - fn consume_aggregate(&mut self) -> Result, Error> { - let offset = self.offset; - let mut aggregate_fun = if self.consume("group") { - AggregateFun::GroupAsc - } else if self.consume("sum") { - AggregateFun::Sum - } else if self.consume("max") { - AggregateFun::Max - } else if self.consume("min") { - AggregateFun::Min - } else if self.consume("list") { - AggregateFun::List - } else if self.consume("concat") { - AggregateFun::Concat - } else if self.consume("avg") { - AggregateFun::Avg - } else if self.consume("count") { - AggregateFun::Count - } else { - return Ok(None) - }; - - if self.consume("(") { - if aggregate_fun == AggregateFun::Count { - try!(self.must_consume(")")); - Ok(Some((aggregate_fun, KeyBuilder::new(), JsonValue::Null))) - } else if aggregate_fun == AggregateFun::Concat { - if let Some(kb) = try!(self.consume_keypath()) { - let json = if self.consume("sep") { - try!(self.must_consume("=")); - 
JsonValue::String(try!(self.must_consume_string_literal())) - } else { - JsonValue::String(",".to_string()) - }; - try!(self.must_consume(")")); - Ok(Some((aggregate_fun, kb, json))) - } else { - Err(Error::Parse("Expected keypath or bind variable".to_string())) - } - } else if let Some(kb) = try!(self.consume_keypath()) { - if self.consume("order") { - try!(self.must_consume("=")); - if self.consume("asc") { - aggregate_fun = AggregateFun::GroupAsc; - } else if self.consume("desc") { - aggregate_fun = AggregateFun::GroupDesc; - } else { - return Err(Error::Parse("Expected asc or desc".to_string())); - } - } - try!(self.must_consume(")")); - - Ok(Some((aggregate_fun, kb, JsonValue::Null))) - } else { - Err(Error::Parse("Expected keypath or bind variable".to_string())) - } - } else { - // this consumed word above might be a Bind var. Unconsume and return nothing. - self.offset = offset; - Ok(None) - } - } - - fn consume_keypath(&mut self) -> Result, Error> { - let key: String = if self.consume(".") { - if self.consume("[") { - let key = try!(self.must_consume_string_literal()); - try!(self.must_consume("]")); - key - } else { - if let Some(key) = self.consume_field() { - key - } else { - self.ws(); - // this means return the whole document - return Ok(Some(KeyBuilder::new())); - } - } - } else { - return Ok(None); - }; - - let mut kb = KeyBuilder::new(); - kb.push_object_key(&key); - loop { - if self.consume("[") { - if let Some(index) = try!(self.consume_integer()) { - kb.push_array_index(index as u64); - } else { - return Err(Error::Parse("Expected array index integer.".to_string())); - } - try!(self.must_consume("]")); - } else if self.consume(".") { - if let Some(key) = self.consume_field() { - kb.push_object_key(&key); - } else { - return Err(Error::Parse("Expected object key.".to_string())); - } - } else { - break; - } - } - self.ws(); - Ok(Some(kb)) - } - - fn consume_number(&mut self) -> Result, Error> { - // Yes this parsing code is hideously verbose. But it conforms exactly to the json spec - // and uses the rust f64 parser, which can't tell us how many characters it used or needs. - - // At the end it then uses the std rust String::parse() method to parse and return - // the f64 value and advance the self.offset. The rust method is a super set of the - // allowable json syntax, so it will parse any valid json floating point number. It might - // return an error if the number is out of bounds. - let mut result = String::new(); - 'outer: loop { - // this loop isn't a loop, it's just there to scope the self borrow - // and then jump to the end to do another borrow (self.ws()) - let mut chars = self.query[self.offset..].chars(); - let mut c = if let Some(c) = chars.next() { - c - } else { - return Ok(None); - }; - - // parse the sign - c = if c == '-' { - result.push('-'); - if let Some(c) = chars.next() { c } else {return Ok(None); } - } else { - c - }; - - // parse the first digit - let mut leading_zero = false; - c = if c == '0' { - result.push('0'); - leading_zero = true; - if let Some(c) = chars.next() { c } else {return Ok(None); } - } else if c >= '1' && c <= '9' { - result.push(c); - if let Some(c) = chars.next() { c } else {return Ok(None); } - } else if result.is_empty() { - // no sign or digits found. 
not a number - return Ok(None); - } else { - return Err(Error::Parse("Expected digits after sign (-).".to_string())); - }; - - // parse remaning significant digits - if !leading_zero { - // no more digits allowed if first digit is zero - loop { - c = if c >= '0' && c <= '9' { - result.push(c); - if let Some(c) = chars.next() { - c - } else { - break 'outer; - } - } else { - break; - }; - } - } - - // parse decimal - c = if c == '.' { - result.push(c); - if let Some(c) = chars.next() { - c - } else { - return Err(Error::Parse("Expected digits after decimal point.".to_string())); - } - } else { - break 'outer; - }; - - // parse mantissa - let mut found_mantissa = false; - loop { - c = if c >= '0' && c <= '9' { - result.push(c); - found_mantissa = true; - - if let Some(c) = chars.next() { - c - } else { - break 'outer; - } - } else { - if found_mantissa { - break; - } - return Err(Error::Parse("Expected digits after decimal point.".to_string())); - }; - } - - // parse exponent symbol - c = if c == 'e' || c == 'E' { - result.push(c); - if let Some(c) = chars.next() { - c - } else { - return Err(Error::Parse("Expected exponent after e.".to_string())); - } - } else { - break 'outer; - }; - - // parse exponent sign - c = if c == '+' || c == '-' { - result.push(c); - if let Some(c) = chars.next() { - c - } else { - return Err(Error::Parse("Expected exponent after e.".to_string())); - } - } else { - c - }; - - // parse exponent digits - let mut found_exponent = false; - loop { - c = if c >= '0' && c <= '9' { - result.push(c); - found_exponent = true; - if let Some(c) = chars.next() { - c - } else { - break 'outer; - } - } else { - if found_exponent { - break 'outer; - } - return Err(Error::Parse("Expected exponent after e.".to_string())); - } - } - } - - self.offset += result.len(); - self.ws(); - Ok(Some(try!(result.parse()))) - } - - - fn must_consume_string_literal(&mut self) -> Result { - if let Some(string) = try!(self.consume_string_literal()) { - Ok(string) - } else { - Err(Error::Parse("Expected string literal.".to_string())) - } - } - - fn consume_string_literal(&mut self) -> Result, Error> { - let mut lit = String::new(); - if !self.could_consume("\"") { - return Ok(None); - } - // can't consume("\"") the leading quote because it will also skip leading whitespace - // inside the string literal - self.offset += 1; - { - let mut chars = self.query[self.offset..].chars(); - 'outer: loop { - let char = if let Some(char) = chars.next() { - char - } else { - break; - }; - if char == '\\' { - self.offset += 1; - - let char = if let Some(char) = chars.next() { - char - } else { - break; - }; - match char { - '\\' | '"' | '/' => lit.push(char), - 'n' => lit.push('\n'), - 'b' => lit.push('\x08'), - 'r' => lit.push('\r'), - 'f' => lit.push('\x0C'), - 't' => lit.push('\t'), - 'v' => lit.push('\x0B'), - 'u' => { - let mut n = 0; - for _i in 0..4 { - let char = if let Some(char) = chars.next() { - char - } else { - break 'outer; - }; - n = match char { - c @ '0' ... '9' => n * 16 + ((c as u16) - ('0' as u16)), - c @ 'a' ... 'f' => n * 16 + (10 + (c as u16) - ('a' as u16)), - c @ 'A' ... 
'F' => n * 16 + (10 + (c as u16) - ('A' as u16)), - _ => return Err(Error::Parse(format!( - "Invalid hexidecimal escape: {}", char))), - }; - - } - self.offset += 3; // 3 because 1 is always added after the match below - }, - _ => return Err(Error::Parse(format!("Unknown character escape: {}", - char))), - }; - self.offset += 1; - } else { - if char == '"' { - break; - } else { - lit.push(char); - self.offset += char.len_utf8(); - } - } - } - } - try!(self.must_consume("\"")); - Ok(Some(lit)) - } - -/* - -find - = "find" ws object ws - -object - = "{" ws obool ws "}" ws (("&&" / "||") ws object)? - / parens - -parens - = "(" ws object ws ")" - -obool - = ws ocompare ws (('&&' / ',' / '||') ws obool)? - -ocompare - = oparens - / key ws ":" ws (oparens / compare) - -oparens - = '(' ws obool ws ')' ws - / array - / object - -compare - = ("==" / "~=" / "~" digits "=" ) ws string ws - -abool - = ws acompare ws (('&&'/ ',' / '||') ws abool)? - -acompare - = aparens - / compare - -aparens - = '(' ws abool ')' ws - / array - / object - -array - = '[' ws abool ']' ws - -key - = field / string - -field - = [a-z_$]i [a-z_$0-9]i* - -string - = '"' ('\\\\' / '\\' [\"tfvrnb] / [^\\\"])* '"' ws - -digits - = [0-9]+ - -ws - = [ \t\n\r]* - -ws1 - = [ \t\n\r]+ -*/ - - - fn find<'b>(&'b mut self) -> Result, Error> { - if !self.consume("find") { - return Err(Error::Parse("Missing 'find' keyword".to_string())); - } - self.object() - } - - fn object<'b>(&'b mut self) -> Result, Error> { - if self.consume("{") { - let left = try!(self.obool()); - try!(self.must_consume("}")); - - if self.consume("&&") { - let right = try!(self.object()); - Ok(Box::new(AndFilter::new(vec![left, right], self.kb.arraypath_len()))) - - } else if self.consume("||") { - let right = try!(self.object()); - Ok(Box::new(OrFilter::new(left, right, self.kb.arraypath_len()))) - } else { - Ok(left) - } - } else { - self.parens() - } - } - - fn parens<'b>(&'b mut self) -> Result, Error> { - try!(self.must_consume("(")); - let filter = try!(self.object()); - try!(self.must_consume(")")); - Ok(filter) - } - - fn obool<'b>(&'b mut self) -> Result, Error> { - let mut filter = try!(self.ocompare()); - loop { - filter = if self.consume("&&") || self.consume(",") { - let right = try!(self.obool()); - Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len())) - } else if self.consume("||") { - let right = try!(self.obool()); - Box::new(OrFilter::new(filter, right, self.kb.arraypath_len())) - } else { - break; - } - } - Ok(filter) - } - - fn ocompare<'b>(&'b mut self) -> Result, Error> { - if let Some(filter) = try!(self.oparens()) { - Ok(filter) - } else if let Some(field) = try!(self.consume_key()) { - self.kb.push_object_key(&field); - try!(self.must_consume(":")); - if let Some(filter) = try!(self.oparens()) { - self.kb.pop_object_key(); - Ok(filter) - } else { - let filter = try!(self.compare()); - self.kb.pop_object_key(); - Ok(filter) - } - } else { - Err(Error::Parse("Expected object key or '('".to_string())) - } - } - - fn oparens<'b>(&'b mut self) -> Result>, Error> { - if self.consume("(") { - let f = try!(self.obool()); - try!(self.must_consume(")")); - Ok(Some(f)) - } else if self.could_consume("[") { - Ok(Some(try!(self.array()))) - } else if self.could_consume("{") { - Ok(Some(try!(self.object()))) - } else { - Ok(None) - } - } - - fn compare<'b>(&'b mut self) -> Result, Error> { - if self.consume("==") { - let literal = try!(self.must_consume_string_literal()); - let stems = Stems::new(&literal); - let mut filters: Vec> = 
Vec::new(); - for stem in stems { - let iter = self.snapshot.iterator(IteratorMode::Start); - let filter = Box::new(ExactMatchFilter::new( - iter, &stem, &self.kb)); - filters.push(filter); - } - match filters.len() { - 0 => panic!("Cannot create a ExactMatchFilter"), - 1 => Ok(filters.pop().unwrap()), - _ => Ok(Box::new(AndFilter::new(filters, self.kb.arraypath_len()))), - } - } else if self.consume("~=") { - // regular search - let literal = try!(self.must_consume_string_literal()); - let stems = Stems::new(&literal); - let stemmed_words: Vec = stems.map(|stem| stem.stemmed).collect(); - - match stemmed_words.len() { - 0 => panic!("Cannot create a StemmedWordFilter"), - 1 => { - let iter = self.snapshot.iterator(IteratorMode::Start); - Ok(Box::new(StemmedWordFilter::new(iter, &stemmed_words[0], &self.kb))) - }, - _ => { - let mut filters: Vec = Vec::new(); - for stemmed_word in stemmed_words { - let iter = self.snapshot.iterator(IteratorMode::Start); - let filter = StemmedWordPosFilter::new(iter, &stemmed_word, &self.kb); - filters.push(filter); - } - Ok(Box::new(StemmedPhraseFilter::new(filters))) - }, - } - } else if self.consume("~") { - let word_distance = match try!(self.consume_integer()) { - Some(int) => int, - None => { - return Err(Error::Parse("Expected integer for proximity search".to_string())); - }, - }; - try!(self.must_consume("=")); - - let literal = try!(self.must_consume_string_literal()); - let stems = Stems::new(&literal); - let mut filters: Vec = Vec::new(); - for stem in stems { - let iter = self.snapshot.iterator(IteratorMode::Start); - let filter = StemmedWordPosFilter::new( - iter, &stem.stemmed, &self.kb); - filters.push(filter); - } - match filters.len() { - 0 => panic!("Cannot create a DistanceFilter"), - _ => Ok(Box::new(DistanceFilter::new(filters, word_distance))), - } - } else { - Err(Error::Parse("Expected comparison operator".to_string())) - } - } - - fn abool<'b>(&'b mut self) -> Result, Error> { - let mut filter = try!(self.acompare()); - loop { - filter = if self.consume("&&") || self.consume(",") { - let right = try!(self.abool()); - Box::new(AndFilter::new(vec![filter, right], self.kb.arraypath_len())) - } else if self.consume("||") { - let right = try!(self.abool()); - Box::new(OrFilter::new(filter, right, self.kb.arraypath_len())) - } else { - break; - } - } - Ok(filter) - } - - fn acompare<'b>(&'b mut self) -> Result, Error> { - if let Some(filter) = try!(self.aparens()) { - Ok(filter) - } else { - self.compare() - } - } - - fn aparens<'b>(&'b mut self) -> Result>, Error> { - if self.consume("(") { - let f = try!(self.abool()); - try!(self.must_consume(")")); - Ok(Some(f)) - } else if self.could_consume("[") { - Ok(Some(try!(self.array()))) - } else if self.could_consume("{") { - Ok(Some(try!(self.object()))) - } else { - Ok(None) - } - } - - fn array<'b>(&'b mut self) -> Result, Error> { - if !self.consume("[") { - return Err(Error::Parse("Expected '['".to_string())); - } - self.kb.push_array(); - let filter = try!(self.abool()); - self.kb.pop_array(); - try!(self.must_consume("]")); - Ok(filter) - } - - fn sort_clause(&mut self) -> Result, Error> { - let mut sort_infos = HashMap::new(); - if self.consume("sort") { - loop { - if let Some(kb) = try!(self.consume_keypath()) { - // doing the search for source 2x so user can order - // anyway they like. Yes it's a hack, but it simple. 
- let mut sort = if self.consume("asc") { - Sort::Asc - } else if self.consume("desc") { - Sort::Desc - } else { - Sort::Asc - }; - - let default = if self.consume("default") { - try!(self.must_consume("=")); - if let Some(json) = try!(self.json()) { - json - } else { - return Err(Error::Parse("Expected Json after default.".to_string())); - } - } else { - JsonValue::Null - }; - - sort = if self.consume("asc") { - Sort::Asc - } else if self.consume("desc") { - Sort::Desc - } else { - sort - }; - - sort_infos.insert(kb.value_key(0), SortInfo{kb:kb, - sort:sort, - default:default}); - if !self.consume(",") { - break; - } - } - } - if sort_infos.is_empty() { - return Err(Error::Parse("Expected field path in sort expression.".to_string())); - } - } - Ok(sort_infos) - } - - fn return_clause(&mut self) -> Result, Error> { - if self.consume("return") { - if let Some(ret_value) = try!(self.ret_value()) { - Ok(ret_value) - } else { - Err(Error::Parse("Expected key, object or array to return.".to_string())) - } - } else { - let mut kb = KeyBuilder::new(); - kb.push_object_key("_id"); - Ok(Box::new(RetValue{kb: kb, ag:None, default: JsonValue::Null, sort: None})) - } - } - - fn ret_object(&mut self) -> Result, Error> { - try!(self.must_consume("{")); - let mut fields: Vec<(String, Box)> = Vec::new(); - loop { - if let Some(field) = try!(self.consume_key()) { - try!(self.must_consume(":")); - if let Some(ret_value) = try!(self.ret_value()) { - fields.push((field, ret_value)); - if !self.consume(",") { - break; - } - } else { - return Err(Error::Parse("Expected key to return.".to_string())); - } - } else { - break; - } - } - - try!(self.must_consume("}")); - Ok(Box::new(RetObject{fields: fields})) - } - - fn ret_array(&mut self) -> Result, Error> { - try!(self.must_consume("[")); - let mut slots = Vec::new(); - loop { - if let Some(ret_value) = try!(self.ret_value()) { - slots.push(ret_value); - if !self.consume(",") { - break; - } - } else { - break; - } - } - try!(self.must_consume("]")); - Ok(Box::new(RetArray{slots: slots})) - - } - - fn ret_value(&mut self) -> Result>, Error> { - if let Some((ag, kb, json)) = try!(self.consume_aggregate()) { - let default = if let Some(default) = try!(self.consume_default()) { - default - } else { - JsonValue::Null - }; - Ok(Some(Box::new(RetValue{kb: kb, ag: Some((ag, json)), - default: default, sort:None}))) - } - else if let Some(kb) = try!(self.consume_keypath()) { - let default = if let Some(default) = try!(self.consume_default()) { - default - } else { - JsonValue::Null - }; - Ok(Some(Box::new(RetValue{kb: kb, ag: None, default: default, sort: None}))) - } else if self.could_consume("{") { - Ok(Some(try!(self.ret_object()))) - } else if self.could_consume("[") { - Ok(Some(try!(self.ret_array()))) - } else if let Some(string) = try!(self.consume_string_literal()) { - Ok(Some(Box::new(RetLiteral{json: JsonValue::String(string)}))) - } else if let Some(num) = try!(self.consume_number()) { - Ok(Some(Box::new(RetLiteral{json: JsonValue::Number(num)}))) - } else { - if self.consume("true") { - Ok(Some(Box::new(RetLiteral{json: JsonValue::True}))) - } else if self.consume("false") { - Ok(Some(Box::new(RetLiteral{json: JsonValue::False}))) - } else if self.consume("null") { - Ok(Some(Box::new(RetLiteral{json: JsonValue::Null}))) - } else { - Ok(None) - } - } - } - - fn limit_clause(&mut self) -> Result { - if self.consume("limit") { - if let Some(i) = try!(self.consume_integer()) { - if i <= 0 { - return Err(Error::Parse("limit must be an integer greater than 0" - 
.to_string())); - } - Ok(i as usize) - } else { - return Err(Error::Parse("limit expects an integer greater than 0" - .to_string())); - } - } else { - Ok(usize::MAX) - } - } - - fn json(&mut self) -> Result, Error> { - if self.could_consume("{") { - Ok(Some(try!(self.json_object()))) - } else if self.could_consume("[") { - Ok(Some(try!(self.json_array()))) - } else if let Some(string) = try!(self.consume_string_literal()) { - Ok(Some(JsonValue::String(string))) - } else { - if self.consume("true") { - Ok(Some(JsonValue::True)) - } else if self.consume("false") { - Ok(Some(JsonValue::False)) - } else if self.consume("null") { - Ok(Some(JsonValue::Null)) - } else if let Some(num) = try!(self.consume_number()) { - Ok(Some(JsonValue::Number(num))) - } else { - Ok(None) - } - } - } - - fn json_object(&mut self) -> Result { - try!(self.must_consume("{")); - let mut object = Vec::new(); - if self.consume("}") { - return Ok(JsonValue::Object(object)); - } - loop { - if let Some(field) = try!(self.consume_key()) { - try!(self.must_consume(":")); - if let Some(json) = try!(self.json()) { - object.push((field, json)); - if !self.consume(",") { - break; - } - } else { - return Err(Error::Parse("Invalid json found".to_string())); - } - } else { - return Err(Error::Parse("Invalid json found".to_string())); - } - } - try!(self.must_consume("}")); - Ok(JsonValue::Object(object)) - } - - fn json_array(&mut self) -> Result { - try!(self.must_consume("[")); - let mut array = Vec::new(); - if self.consume("]") { - return Ok(JsonValue::Array(array)); - } - loop { - if let Some(json) = try!(self.json()) { - array.push(json); - if !self.consume(",") { - break; - } - } else { - return Err(Error::Parse("Invalid json found".to_string())); - } - } - try!(self.must_consume("]")); - Ok(JsonValue::Array(array)) - } - - fn build_filter(&mut self) -> Result, Error> { - self.ws(); - Ok(try!(self.find())) - } -} @@ -1615,187 +675,20 @@ impl AggregateFun { } } -#[derive(PartialEq, PartialOrd, Clone, Debug)] -pub enum JsonValue { - Number(f64), - String(String), - Array(Vec), - Object(Vec<(String, JsonValue)>), - True, - False, - Null, +#[derive(PartialEq, Eq, Clone)] +pub enum Sort { + Asc, + Desc, } -impl JsonValue { - fn str_to_literal(string: &str) ->String { - let mut ret = String::with_capacity(string.len()*2+2); - ret.push('"'); - for c in string.chars() { - if c == '"' || c == '\\' { - ret.push('\\'); - } - ret.push(c); - } - ret.push('"'); - ret - } - - fn cmp_always_equal(_a: &JsonValue, _b: &JsonValue) -> Ordering { - Ordering::Equal - } - - fn cmp_f64(a: &JsonValue, b: &JsonValue) -> Ordering { - if let &JsonValue::Number(a_val) = a { - if let &JsonValue::Number(b_val) = b { - if a_val < b_val { - Ordering::Less - } else if a_val > b_val { - Ordering::Greater - } else { - Ordering::Equal - } - } else { - panic!("cast error in cmp_f64"); - } - } else { - panic!("cast error in cmp_f64"); - } - } - - fn cmp_string(a: &JsonValue, b: &JsonValue) -> Ordering { - if let &JsonValue::String(ref a_val) = a { - if let &JsonValue::String(ref b_val) = b { - // Note we eventually want to switch to a collation library like ICU - a_val.cmp(&b_val) - } else { - panic!("cast error in cmp_string"); - } - } else { - panic!("cast error in cmp_string"); - } - } - - fn cmp_array(a: &JsonValue, b: &JsonValue) -> Ordering { - if let &JsonValue::Array(ref a_val) = a { - if let &JsonValue::Array(ref b_val) = b { - for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { - let order = a_el.cmp(&b_el); - if order != Ordering::Equal { - return 
order; - } - } - // if we got here all elements were equal. But one array might be longer - // so sort it last - a_val.len().cmp(&b_val.len()) - } else { - panic!("cast error in cmp_array"); - } - } else { - panic!("cast error in cmp_array"); - } - } - - fn cmp_object(a: &JsonValue, b: &JsonValue) -> Ordering { - if let &JsonValue::Object(ref a_val) = a { - if let &JsonValue::Object(ref b_val) = b { - for (a_el, b_el) in a_val.iter().zip(b_val.iter()) { - // compare key - let mut order = a_el.0.cmp(&b_el.0); - if order != Ordering::Equal { - return order; - } - // compare value - order = a_el.1.cmp(&b_el.1); - if order != Ordering::Equal { - return order; - } - } - // if we got here all elements were equal. But one object might be longer - // so sort it last - a_val.len().cmp(&b_val.len()) - } else { - panic!("cast error in cmp_object"); - } - } else { - panic!("cast error in cmp_object"); - } - } - - fn type_sort_order(&self) -> (usize, fn(&JsonValue, &JsonValue) -> Ordering) { - match self { - &JsonValue::Null => (0, JsonValue::cmp_always_equal), - &JsonValue::False => (1, JsonValue::cmp_always_equal), - &JsonValue::True => (2, JsonValue::cmp_always_equal), - &JsonValue::Number(_) => (3, JsonValue::cmp_f64), - &JsonValue::String(_) => (4, JsonValue::cmp_string), - &JsonValue::Array(_) => (5, JsonValue::cmp_array), - &JsonValue::Object(_) => (6, JsonValue::cmp_object), - } - } - - fn render(&self, write: &mut Write) -> Result<(), Error> { - match self { - &JsonValue::Number(ref num) => try!(write.write_all(num.to_string().as_bytes())), - &JsonValue::String(ref string) => { - try!(write.write_all(JsonValue::str_to_literal(&string).as_bytes())) - }, - &JsonValue::Array(ref array) => { - try!(write.write_all("[".as_bytes())); - - let mut iter = array.iter().peekable(); - loop { - match iter.next() { - Some(json) => try!(json.render(write)), - None => break, - } - if iter.peek().is_some() { - try!(write.write_all(",".as_bytes())); - } - } - try!(write.write_all("]".as_bytes())); - }, - &JsonValue::Object(ref object) => { - try!(write.write_all("{".as_bytes())); - - let mut iter = object.iter().peekable(); - loop { - match iter.next() { - Some(&(ref key, ref json)) => { - try!(write.write_all(JsonValue::str_to_literal(&key).as_bytes())); - try!(write.write_all(":".as_bytes())); - try!(json.render(write)); - } - None => break, - } - if iter.peek().is_some() { - try!(write.write_all(",".as_bytes())); - } - } - try!(write.write_all("}".as_bytes())); - }, - &JsonValue::True => try!(write.write_all("true".as_bytes())), - &JsonValue::False => try!(write.write_all("false".as_bytes())), - &JsonValue::Null => try!(write.write_all("null".as_bytes())), - } - Ok(()) - } +pub struct SortInfo { + pub kb: KeyBuilder, + pub sort: Sort, + pub default: JsonValue, } -impl Eq for JsonValue {} - -impl Ord for JsonValue { - fn cmp(&self, other: &JsonValue) -> Ordering { - let (self_order_num, self_cmp_fun) = self.type_sort_order(); - let (other_order_num, _other_cmp_fun) = other.type_sort_order(); - match self_order_num.cmp(&other_order_num) { - Ordering::Less => Ordering::Less, - Ordering::Greater => Ordering::Greater, - Ordering::Equal => self_cmp_fun(self, other), - } - } -} -trait Returnable { +pub trait Returnable { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, bind_var_keys: &HashMap, result: &mut VecDeque) -> Result<(), Error>; @@ -1810,8 +703,8 @@ trait Returnable { write: &mut Write) -> Result<(), Error>; } -struct RetObject { - fields: Vec<(String, Box)>, +pub struct RetObject { + pub fields: 
Vec<(String, Box)>, } impl Returnable for RetObject { @@ -1865,8 +758,8 @@ impl Returnable for RetObject { } -struct RetArray { - slots: Vec>, +pub struct RetArray { + pub slots: Vec>, } impl Returnable for RetArray { @@ -1916,19 +809,9 @@ impl Returnable for RetArray { } } -#[derive(PartialEq, Eq, Clone)] -enum Sort { - Asc, - Desc, -} -struct SortInfo { - kb: KeyBuilder, - sort: Sort, - default: JsonValue, -} -struct RetHidden { +pub struct RetHidden { unrendered: Vec, visible: Box, } @@ -1970,8 +853,8 @@ impl Returnable for RetHidden { } } -struct RetLiteral { - json: JsonValue, +pub struct RetLiteral { + pub json: JsonValue, } impl Returnable for RetLiteral { @@ -2001,10 +884,10 @@ impl Returnable for RetLiteral { } pub struct RetValue { - kb: KeyBuilder, - ag: Option<(AggregateFun, JsonValue)>, - default: JsonValue, - sort: Option, + pub kb: KeyBuilder, + pub ag: Option<(AggregateFun, JsonValue)>, + pub default: JsonValue, + pub sort: Option, } impl RetValue { @@ -2216,42 +1099,11 @@ impl Returnable for RetValue { mod tests { extern crate rustc_serialize; - use super::{Parser, Query}; + use super::Query; use index::{Index, OpenOptions}; - use rocksdb::Snapshot; - #[test] - fn test_whitespace() { - let mut index = Index::new(); - index.open("target/tests/test_whitespace", Some(OpenOptions::Create)).unwrap(); - let rocks = &index.rocks.unwrap(); - let mut snapshot = Snapshot::new(rocks); - - let mut query = " \n \t test".to_string(); - let mut parser = Parser::new(query, snapshot); - parser.ws(); - assert_eq!(parser.offset, 5); - - snapshot = Snapshot::new(rocks); - query = "test".to_string(); - parser = Parser::new(query, snapshot); - parser.ws(); - assert_eq!(parser.offset, 0); - } - - #[test] - fn test_must_consume_string_literal() { - let mut index = Index::new(); - index.open("target/tests/test_must_consume_string_literal", Some(OpenOptions::Create)).unwrap(); - let rocks = &index.rocks.unwrap(); - let snapshot = Snapshot::new(rocks); - - let query = r#"" \n \t test""#.to_string(); - let mut parser = Parser::new(query, snapshot); - assert_eq!(parser.must_consume_string_literal().unwrap(), " \n \t test".to_string()); - } #[test] fn test_query_hello_world() { From a23e1f7c5836d1c6313025aa64274124340d22e1 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Mon, 2 Jan 2017 11:07:33 -0800 Subject: [PATCH 052/122] Bind variable first implementation Currently collects all values into an array. Need options for expanding these rows into arrays. Possibly permuted with multiple bind vars. 
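A sketch of the DocResult merge these filter changes rely on (illustrative
only -- it assumes the bind results live in a HashMap<String, Vec<String>>
keyed by bind variable name; the actual field layout is in query.rs):

    use std::collections::HashMap;

    pub struct DocResult {
        pub seq: u64,
        bind_name_result: HashMap<String, Vec<String>>,
    }

    impl DocResult {
        pub fn add_bind_name_result(&mut self, bind_name: &str, value_key: String) {
            self.bind_name_result
                .entry(bind_name.to_string())
                .or_insert_with(Vec::new)
                .push(value_key);
        }

        // When two filters match the same doc seq, drain the other side's
        // bindings into self so the combined result carries the value keys
        // collected on both sides.
        pub fn combine_bind_name_results(&mut self, other: &mut DocResult) {
            debug_assert_eq!(self.seq, other.seq);
            for (name, mut keys) in other.bind_name_result.drain() {
                self.bind_name_result
                    .entry(name)
                    .or_insert_with(Vec::new)
                    .append(&mut keys);
            }
        }
    }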
--- src/filters.rs | 85 +++++++++++++-- src/key_builder.rs | 30 ++++++ src/parser.rs | 121 +++++++++++++++------ src/query.rs | 263 ++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 430 insertions(+), 69 deletions(-) diff --git a/src/filters.rs b/src/filters.rs index 070c613..d1d1c72 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -440,6 +440,7 @@ impl<'a> AndFilter<'a> { if base_result == next_result { matches_count -= 1; + base_result.combine_bind_name_results(&mut next_result); if matches_count == 0 { return Ok(Some(base_result)); } @@ -531,12 +532,12 @@ impl<'a> OrFilter<'a> { } } fn take_smallest(&mut self) -> Option { - if let Some(left) = self.left.result.take() { + if let Some(mut left) = self.left.result.take() { // left exists - if let Some(right) = self.right.result.take() { + if let Some(mut right) = self.right.result.take() { // both exist, return smallest match left.cmp(&right) { - Ordering::Less => { + Ordering::Less => { // left is smallest, return and put back right self.right.result = Some(right); Some(left) @@ -546,9 +547,9 @@ impl<'a> OrFilter<'a> { self.left.result = Some(left); Some(right) }, - Ordering::Equal => { - // return one and discard the other so we don't return - // identical result in a subsequent call + Ordering::Equal => { + left.combine_bind_name_results(&mut right); + self.right.result = Some(right); Some(left) }, } @@ -582,3 +583,75 @@ impl<'a> QueryRuntimeFilter for OrFilter<'a> { Ok(self.take_smallest()) } } + +pub struct BindFilter<'a> { + bind_var_name: String, + filter: Box, + kb: KeyBuilder, + option_next: Option, +} + +impl<'a> BindFilter<'a> { + + pub fn new(bind_var_name: String, + filter: Box, + kb: KeyBuilder) -> BindFilter { + BindFilter { + bind_var_name: bind_var_name, + filter: filter, + kb: kb, + option_next: None, + } + } + + fn collect_results(&mut self, mut first: DocResult) -> Result, Error> { + let value_key = self.kb.value_key_from_doc_result(&first); + first.add_bind_name_result(&self.bind_var_name, value_key); + + while let Some(next) = try!(self.filter.next_result()) { + if next.seq == first.seq { + let value_key = self.kb.value_key_from_doc_result(&next); + first.add_bind_name_result(&self.bind_var_name, value_key); + } else { + self.option_next = Some(next); + return Ok(Some(first)); + } + } + Ok(Some(first)) + } +} + +impl<'a> QueryRuntimeFilter for BindFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + let first = if let Some(next) = self.option_next.take() { + if next >= *start { + Some(next) + } else { + try!(self.filter.first_result(&start)) + } + } else { + try!(self.filter.first_result(&start)) + }; + + if let Some(first) = first { + self.collect_results(first) + } else { + Ok(None) + } + } + + fn next_result(&mut self) -> Result, Error> { + let first = if let Some(next) = self.option_next.take() { + Some(next) + } else { + try!(self.filter.next_result()) + }; + + if let Some(first) = first { + self.collect_results(first) + } else { + Ok(None) + } + } +} + diff --git a/src/key_builder.rs b/src/key_builder.rs index 4a64b68..0f346b1 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -73,6 +73,36 @@ impl KeyBuilder { string } + /// Returns a value key without the doc seq prepended. 
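+ /// For example, with keypath `.bar$.a` and arraypath `[1]` this yields `.bar$1.a`;
+ /// `value_key_from_doc_result` below builds the seq-prefixed form, e.g. `V42#.bar$1.a`
+ /// (the seq 42 here is illustrative).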
+ pub fn value_key_path_only(&self) -> String { + let mut string = String::with_capacity(100); + let mut i = 0; + for segment in &self.keypath { + string.push_str(&segment); + if segment == "$" { + string.push_str(&self.arraypath[i].to_string()); + i += 1; + } + } + string + } + + pub fn value_key_from_doc_result(&self, dr: &DocResult) -> String { + let mut string = String::with_capacity(100); + string.push('V'); + string.push_str(&dr.seq.to_string()); + string.push('#'); + let mut i = 0; + for segment in &self.keypath { + string.push_str(&segment); + if segment == "$" { + string.push_str(&dr.arraypath[i].to_string()); + i += 1; + } + } + string + } + fn add_arraypath(string: &mut String, arraypath: &Vec) { if arraypath.is_empty() { string.push(','); diff --git a/src/parser.rs b/src/parser.rs index ab8a829..9e715f1 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -8,9 +8,10 @@ use error::Error; use key_builder::KeyBuilder; use stems::Stems; use json_value::JsonValue; -use query::{Sort, Returnable, RetValue, RetObject, RetArray, RetLiteral, AggregateFun, SortInfo}; +use query::{Sort, Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, AggregateFun, + SortInfo}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, - StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter}; + StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs @@ -139,6 +140,7 @@ impl<'a> Parser<'a> { } fn consume_aggregate(&mut self) -> Result, KeyBuilder, JsonValue)>, Error> { let offset = self.offset; @@ -165,8 +167,10 @@ impl<'a> Parser<'a> { if self.consume("(") { if aggregate_fun == AggregateFun::Count { try!(self.must_consume(")")); - Ok(Some((aggregate_fun, KeyBuilder::new(), JsonValue::Null))) + Ok(Some((aggregate_fun, None, KeyBuilder::new(), JsonValue::Null))) } else if aggregate_fun == AggregateFun::Concat { + let bind_name_option = self.consume_field(); + if let Some(kb) = try!(self.consume_keypath()) { let json = if self.consume("sep") { try!(self.must_consume("=")); @@ -175,26 +179,30 @@ impl<'a> Parser<'a> { JsonValue::String(",".to_string()) }; try!(self.must_consume(")")); - Ok(Some((aggregate_fun, kb, json))) + Ok(Some((aggregate_fun, bind_name_option, kb, json))) } else { Err(Error::Parse("Expected keypath or bind variable".to_string())) } - } else if let Some(kb) = try!(self.consume_keypath()) { - if self.consume("order") { - try!(self.must_consume("=")); - if self.consume("asc") { - aggregate_fun = AggregateFun::GroupAsc; - } else if self.consume("desc") { - aggregate_fun = AggregateFun::GroupDesc; - } else { - return Err(Error::Parse("Expected asc or desc".to_string())); + } else { + let bind_name_option = self.consume_field(); + + if let Some(kb) = try!(self.consume_keypath()) { + if self.consume("order") { + try!(self.must_consume("=")); + if self.consume("asc") { + aggregate_fun = AggregateFun::GroupAsc; + } else if self.consume("desc") { + aggregate_fun = AggregateFun::GroupDesc; + } else { + return Err(Error::Parse("Expected asc or desc".to_string())); + } } - } - try!(self.must_consume(")")); + try!(self.must_consume(")")); - Ok(Some((aggregate_fun, kb, JsonValue::Null))) - } else { - Err(Error::Parse("Expected keypath or bind variable".to_string())) + Ok(Some((aggregate_fun, bind_name_option, kb, JsonValue::Null))) + } else { + Err(Error::Parse("Expected keypath or bind variable".to_string())) + } } } else { // this 
consumed word above might be a Bind var. Unconsume and return nothing. @@ -606,7 +614,11 @@ ws1 } else if self.could_consume("{") { Ok(Some(try!(self.object()))) } else { - Ok(None) + if let Some(filter) = try!(self.bind_var()) { + Ok(Some(filter)) + } else { + Ok(None) + } } } @@ -709,8 +721,28 @@ ws1 } else if self.could_consume("{") { Ok(Some(try!(self.object()))) } else { - Ok(None) + if let Some(filter) = try!(self.bind_var()) { + Ok(Some(filter)) + } else { + Ok(None) + } + } + } + + fn bind_var<'b>(&'b mut self) -> Result>, Error> { + let offset = self.offset; + if let Some(bind_name) = self.consume_field() { + if self.consume("::") { + let filter = try!(self.array()); + self.kb.push_array(); + let kb_clone = self.kb.clone(); + self.kb.pop_array(); + return Ok(Some(Box::new(BindFilter::new(bind_name, filter, kb_clone)))); + } + //we got here so unconsume the chars + self.offset = offset; } + Ok(None) } fn array<'b>(&'b mut self) -> Result, Error> { @@ -829,21 +861,50 @@ ws1 } fn ret_value(&mut self) -> Result>, Error> { - if let Some((ag, kb, json)) = try!(self.consume_aggregate()) { + if self.consume("true") { + return Ok(Some(Box::new(RetLiteral{json: JsonValue::True}))); + } else if self.consume("false") { + return Ok(Some(Box::new(RetLiteral{json: JsonValue::False}))); + } else if self.consume("null") { + return Ok(Some(Box::new(RetLiteral{json: JsonValue::Null}))); + } + + if let Some((ag, bind_name_option, kb, json)) = try!(self.consume_aggregate()) { let default = if let Some(default) = try!(self.consume_default()) { default } else { JsonValue::Null }; - Ok(Some(Box::new(RetValue{kb: kb, ag: Some((ag, json)), - default: default, sort:None}))) - } - else if let Some(kb) = try!(self.consume_keypath()) { + if let Some(bind_name) = bind_name_option { + let extra_key = kb.value_key_path_only(); + Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_key: extra_key, + ag: Some((ag, json)), default: default, sort:None}))) + } else { + Ok(Some(Box::new(RetValue{kb: kb, ag: Some((ag, json)), + default: default, sort:None}))) + } + } else if let Some(bind_name) = self.consume_field() { + let extra_key = if let Some(kb) = try!(self.consume_keypath()) { + kb.value_key_path_only() + } else { + "".to_string() + }; + + let default = if let Some(default) = try!(self.consume_default()) { + default + } else { + JsonValue::Null + }; + + Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_key: extra_key, + ag: None, default: default, sort:None}))) + } else if let Some(kb) = try!(self.consume_keypath()) { let default = if let Some(default) = try!(self.consume_default()) { default } else { JsonValue::Null }; + Ok(Some(Box::new(RetValue{kb: kb, ag: None, default: default, sort: None}))) } else if self.could_consume("{") { Ok(Some(try!(self.ret_object()))) @@ -854,15 +915,7 @@ ws1 } else if let Some(num) = try!(self.consume_number()) { Ok(Some(Box::new(RetLiteral{json: JsonValue::Number(num)}))) } else { - if self.consume("true") { - Ok(Some(Box::new(RetLiteral{json: JsonValue::True}))) - } else if self.consume("false") { - Ok(Some(Box::new(RetLiteral{json: JsonValue::False}))) - } else if self.consume("null") { - Ok(Some(Box::new(RetLiteral{json: JsonValue::Null}))) - } else { - Ok(None) - } + Ok(None) } } diff --git a/src/query.rs b/src/query.rs index d4dffed..b1de97d 100644 --- a/src/query.rs +++ b/src/query.rs @@ -16,15 +16,15 @@ use parser::Parser; use json_value::JsonValue; use filters::QueryRuntimeFilter; - // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import 
the individual tihngs use rocksdb::{self, DBIterator, IteratorMode, Snapshot}; -#[derive(PartialEq, Eq, PartialOrd, Clone)] +#[derive(Clone)] pub struct DocResult { pub seq: u64, pub arraypath: Vec, + pub bind_name_result: HashMap>, } impl DocResult { @@ -32,10 +32,49 @@ impl DocResult { DocResult { seq: 0, arraypath: Vec::new(), + bind_name_result: HashMap::new(), + } + } + + pub fn add_bind_name_result(&mut self, bind_name: &str, result_key: String) { + if let Some(ref mut result_keys) = self.bind_name_result.get_mut(bind_name) { + result_keys.push(result_key); + return; + } + self.bind_name_result.insert(bind_name.to_string(), vec![result_key]); + } + + pub fn combine_bind_name_results(&mut self, other: &mut DocResult) { + let mut replace = HashMap::new(); + swap(&mut replace, &mut other.bind_name_result); + for (bind_name, mut result_keys_other) in replace.into_iter() { + if let Some(ref mut result_keys) = self.bind_name_result.get_mut(&bind_name) { + result_keys.append(&mut result_keys_other); + continue; + } + self.bind_name_result.insert(bind_name, result_keys_other); } } } +impl PartialEq for DocResult { + fn eq(&self, other: &DocResult) -> bool { + if self.seq != other.seq { + false + } else { + self.arraypath == other.arraypath + } + } +} + +impl Eq for DocResult {} + +impl PartialOrd for DocResult { + fn partial_cmp(&self, other: &DocResult) -> Option { + Some(self.cmp(other)) + } +} + impl Ord for DocResult { fn cmp(&self, other: &DocResult) -> Ordering { match self.seq.cmp(&other.seq) { @@ -221,7 +260,7 @@ impl<'a> QueryResults<'a> { } } - fn get_next(&mut self) -> Result, Error> { + fn get_next_result(&mut self) -> Result, Error> { if self.done_with_sorting_and_ags { return Ok(None); } @@ -229,12 +268,20 @@ impl<'a> QueryResults<'a> { match result { Some(doc_result) => { self.doc_result_next.seq = doc_result.seq + 1; - Ok(Some(doc_result.seq)) + Ok(Some(doc_result)) }, None => Ok(None), } } + fn get_next(&mut self) -> Result, Error> { + if let Some(doc_result) = try!(self.get_next_result()) { + Ok(Some(doc_result.seq)) + } else { + Ok(None) + } + } + pub fn get_next_id(&mut self) -> Result, Error> { let seq = try!(self.get_next()); match seq { @@ -253,12 +300,16 @@ impl<'a> QueryResults<'a> { pub fn next_result(&mut self) -> Result, Error> { if self.needs_sorting_and_ags { loop { - match if self.done_with_sorting_and_ags { None } else { try!(self.get_next()) } { - Some(seq) => { - let bind = HashMap::new(); + let next = if self.done_with_sorting_and_ags { + None + } else { + try!(self.get_next_result()) + }; + match next { + Some(dr) => { let mut results = VecDeque::new(); - try!(self.returnable.fetch_result(&mut self.iter, seq, - &bind, &mut results)); + try!(self.returnable.fetch_result(&mut self.iter, dr.seq, + &dr.bind_name_result, &mut results)); self.in_buffer.push(results); if self.in_buffer.len() == self.limit { self.do_sorting_and_ags(); @@ -289,13 +340,13 @@ impl<'a> QueryResults<'a> { } } } else { - let seq = match try!(self.get_next()) { - Some(seq) => seq, + let dr = match try!(self.get_next_result()) { + Some(dr) => dr, None => return Ok(None), }; - let bind = HashMap::new(); let mut results = VecDeque::new(); - try!(self.returnable.fetch_result(&mut self.iter, seq, &bind, &mut results)); + try!(self.returnable.fetch_result(&mut self.iter, dr.seq, + &dr.bind_name_result, &mut results)); self.buffer.clear(); try!(self.returnable.write_result(&mut results, &mut self.buffer)); Ok(Some(unsafe{str::from_utf8_unchecked(&self.buffer[..])}.to_string())) @@ -690,7 +741,7 
@@ pub struct SortInfo { pub trait Returnable { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, - bind_var_keys: &HashMap, + bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error>; fn get_aggregate_funs(&self, funs: &mut Vec>); @@ -709,7 +760,7 @@ pub struct RetObject { impl Returnable for RetObject { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, - bind_var_keys: &HashMap, + bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { for &(ref _key, ref field) in self.fields.iter() { try!(field.fetch_result(iter, seq, bind_var_keys, result)); @@ -764,7 +815,7 @@ pub struct RetArray { impl Returnable for RetArray { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, - bind_var_keys: &HashMap, + bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { for ref slot in self.slots.iter() { try!(slot.fetch_result(iter, seq, bind_var_keys, result)); @@ -818,7 +869,7 @@ pub struct RetHidden { impl Returnable for RetHidden { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, - bind_var_keys: &HashMap, + bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { for ref mut unrendered in self.unrendered.iter() { try!(unrendered.fetch_result(iter, seq, bind_var_keys, result)); @@ -859,7 +910,7 @@ pub struct RetLiteral { impl Returnable for RetLiteral { fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, - _bind_var_keys: &HashMap, + _bind_var_keys: &HashMap>, _result: &mut VecDeque) -> Result<(), Error> { Ok(()) } @@ -1027,23 +1078,15 @@ impl RetValue { impl Returnable for RetValue { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, - bind_var_keys: &HashMap, + _bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { //don't fetch anything for count(). 
just stick in a null result.push_back(JsonValue::Null); return Ok(()); } - let value_key = if self.kb.keypath_segments_len() == 1 { - let key = self.kb.peek_object_key(); - if let Some(value_key) = bind_var_keys.get(&key) { - value_key.to_string() - } else { - self.kb.value_key(seq) - } - } else { - self.kb.value_key(seq) - }; + + let value_key = self.kb.value_key(seq); // Seek in index to >= entry iter.set_mode(IteratorMode::From(value_key.as_bytes(), @@ -1093,6 +1136,75 @@ impl Returnable for RetValue { } } +pub struct RetBind { + pub bind_name: String, + pub extra_key: String, + pub ag: Option<(AggregateFun, JsonValue)>, + pub default: JsonValue, + pub sort: Option, +} + + +impl Returnable for RetBind { + fn fetch_result(&self, iter: &mut DBIterator, _seq: u64, + bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error> { + + if let Some(value_keys) = bind_var_keys.get(&self.bind_name) { + let mut array = Vec::with_capacity(value_keys.len()); + for base_key in value_keys { + // Seek in index to >= entry + let value_key = base_key.to_string() + &self.extra_key; + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => { + result.push_back(self.default.clone()); + return Ok(()) + }, + }; + + if !key.starts_with(value_key.as_bytes()) { + array.push(self.default.clone()); + } else { + array.push(try!(RetValue::fetch(&mut iter.peekable(), &value_key, + key, value))); + } + } + result.push_back(JsonValue::Array(array)); + } else { + result.push_back(JsonValue::Array(vec![self.default.clone()])) + } + + Ok(()) + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + funs.push(self.ag.clone()); + } + + fn get_sorting(&self, sorts: &mut Vec>) { + sorts.push(self.sort.clone()); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + if let Some(sort_info) = map.remove(&(self.bind_name.to_string() + &self.extra_key)) { + self.sort = Some(sort_info.sort); + } + } + + fn write_result(&self, results: &mut VecDeque, + write: &mut Write) -> Result<(), Error> { + if let Some(json) = results.pop_front() { + try!(json.render(write)); + } else { + panic!("missing result!"); + } + Ok(()) + } +} #[cfg(test)] @@ -1569,6 +1681,99 @@ mod tests { } + #[test] + fn test_query_bind_var() { + let dbname = "target/tests/querytestbindvar"; + + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + + assert_eq!(Ok(()), index.add(r#"{"_id":"1", "bar": [{"a":"foo","v":1},{"a":"bar","v":2}]}"#)); + + index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}]} + return x "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[{"a":"foo","v":1}]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}]} + return x.v "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo" || a: =="bar"}]} + return x.v "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = 
Query::get_matches(r#"find {bar: x::[{a: =="foo" || a: =="baz"}]} + return x.v "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[1]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foof" || a: =="bar"}]} + return x.v "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] || bar: x::[{a: =="bar"}]} + return x.v "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,2]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="bar"}]} + return [x.v, y.v] "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[[1],[2]]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="baz"}]} + return [x.v, y.v default=0] "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#"[[1],[0]]"#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] && bar: y::[{a: =="baz"}]} + return [x.v, y.v] "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(), None); + } + } + + #[test] fn test_query_more_docs() { let dbname = "target/tests/querytestdbmoredocs"; From dac305a38b15337418ebff674f4f91bfa6454d6e Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sat, 7 Jan 2017 14:49:29 -0800 Subject: [PATCH 053/122] relevancy scoring and boosting Use `score() desc` in sort clause to order by most relevant first. Use `score()` in return clause to return score to client/user. Use `^2.0` after comparison string, array, object or parens to boost the query clause(s) preceding. --- capnp/records.capnp | 12 +- src/filters.rs | 343 ++++++++++++++++++++++-------- src/index.rs | 40 +++- src/json_shred.rs | 66 ++++-- src/key_builder.rs | 20 ++ src/parser.rs | 148 ++++++++++--- src/query.rs | 493 +++++++++++++++++++++++++++++++++----------- src/stems.rs | 10 +- 8 files changed, 866 insertions(+), 266 deletions(-) diff --git a/capnp/records.capnp b/capnp/records.capnp index ccd3d7d..c1dcd81 100644 --- a/capnp/records.capnp +++ b/capnp/records.capnp @@ -5,23 +5,18 @@ struct Header { highSeq @1 :UInt64; } -enum Case { - uppercase @0; - propercase @1; -} - struct Payload { struct Wordinfo { # Contains stemmed word and information about the orignal word before stemming # the position of the word in the text field - wordPos @0 :UInt64; + wordPos @0 :UInt32; # the offset of the suffix from the start of the stemmed word # when combined with the stemmed word gets back the orignal # text with case preserved - suffixOffset @1 :UInt64; + suffixOffset @1 :UInt32; # the actual suffix text, which can start at any point in the stemmed word suffixText @2 :Text; @@ -32,6 +27,7 @@ struct Payload { # need to be stored in the suffix text for most words at the cost of 1 byte per word # info. 
} - wordinfos @0 :List(Wordinfo); + totalWords @0: UInt32; + wordinfos @1 :List(Wordinfo); } diff --git a/src/filters.rs b/src/filters.rs index d1d1c72..0f19f40 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -4,38 +4,112 @@ use std::str; use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::HashSet; +use index::Index; +use std::f32; use error::Error; use key_builder::KeyBuilder; use stems::StemmedWord; -use query::DocResult; +use query::{DocResult, QueryScoringInfo}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs use rocksdb::{self, DBIterator, IteratorMode}; use records_capnp::payload; +struct Scorer { + idf: f32, + boost: f32, + keypathword_count_key: String, + keypath_count_key: String, + term_ordinal: usize, +} + +impl Scorer { + fn new(word: &str, kb: &KeyBuilder, boost: f32) -> Scorer { + Scorer { + idf: f32::NAN, + boost: boost, + keypathword_count_key: kb.keypathword_count_key(&word), + keypath_count_key: kb.keypath_count_key(), + term_ordinal: 0, + } + } + + fn init(&mut self, mut iter: &mut DBIterator, qsi: &mut QueryScoringInfo) { + let doc_freq = if let Some(bytes) = self.get_value(&mut iter, + &self.keypathword_count_key) { + Index::convert_bytes_to_u64(bytes.as_ref()) as f32 + } else { + 0.0 + }; + + let num_docs = if let Some(bytes) = self.get_value(&mut iter, &self.keypath_count_key) { + Index::convert_bytes_to_u64(bytes.as_ref()) as f32 + } else { + 0.0 + }; + + self.idf = 1.0 + (num_docs/(doc_freq + 1.0)).ln(); + self.term_ordinal = qsi.num_terms; + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += self.idf * self.idf; + } + + fn get_value(&self, iter: &mut DBIterator, key: &String) -> Option> { + iter.set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); + if let Some((ret_key, ret_value)) = iter.next() { + if ret_key.len() == key.len() && ret_key.starts_with(key.as_bytes()) { + Some(ret_value) + } else { + None + } + } else { + None + } + } + + fn add_match_score(&self, num_matches: u32, + total_field_words: u32, dr: &mut DocResult) { + if self.should_score() { + let tf: f32 = (num_matches as f32).sqrt(); + let norm = 1.0/(total_field_words as f32).sqrt(); + let score = self.idf * self.idf * tf * norm * self.boost; + dr.add_score(self.term_ordinal, score); + } + } + + fn should_score(&self) -> bool { + !self.idf.is_nan() + } +} pub trait QueryRuntimeFilter { fn first_result(&mut self, start: &DocResult) -> Result, Error>; fn next_result(&mut self) -> Result, Error>; + fn prepare_relevancy_scoring(&mut self, qsi: &mut QueryScoringInfo); } pub struct ExactMatchFilter { iter: DBIterator, keypathword: String, - word_pos: u64, + word_pos: u32, + suffix_offset: u32, suffix: String, - suffix_offset: u64, + scorer: Scorer, } + + impl ExactMatchFilter { - pub fn new(iter: DBIterator, stemmed_word: &StemmedWord, kb: &KeyBuilder) -> ExactMatchFilter { + pub fn new(iter: DBIterator, stemmed_word: &StemmedWord, + kb: &KeyBuilder, boost: f32) -> ExactMatchFilter { ExactMatchFilter{ iter: iter, keypathword: kb.get_keypathword_only(&stemmed_word.stemmed), - word_pos: stemmed_word.word_pos as u64, + word_pos: stemmed_word.word_pos, suffix: stemmed_word.suffix.clone(), - suffix_offset: stemmed_word.suffix_offset as u64, + suffix_offset: stemmed_word.suffix_offset, + scorer: Scorer::new(&stemmed_word.stemmed, &kb, boost), } } } @@ -55,10 +129,6 @@ impl QueryRuntimeFilter for ExactMatchFilter { fn next_result(&mut self) -> Result, Error> { loop { - if !self.iter.valid() { - return 
Ok(None) - } - let (key, value) = match self.iter.next() { Some((key, value)) => (key, value), None => return Ok(None), @@ -74,30 +144,40 @@ impl QueryRuntimeFilter for ExactMatchFilter { let message_reader = ::capnp::serialize_packed::read_message( &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); let payload = message_reader.get_root::().unwrap(); - - for wi in try!(payload.get_wordinfos()).iter() { + let wordinfos = try!(payload.get_wordinfos()); + for wi in wordinfos.iter() { if self.word_pos == wi.get_word_pos() && self.suffix_offset == wi.get_suffix_offset() && self.suffix == try!(wi.get_suffix_text()) { // We have a candidate document to return let key_str = unsafe{str::from_utf8_unchecked(&key)}; - return Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))); + let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); + self.scorer.add_match_score(wordinfos.len(), + payload.get_total_words(), &mut dr); + return Ok(Some(dr)); } } } } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.scorer.init(&mut self.iter, &mut qsi); + } } pub struct StemmedWordFilter { iter: DBIterator, keypathword: String, + scorer: Scorer, } impl StemmedWordFilter { - pub fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordFilter { + pub fn new(iter: DBIterator, stemmed_word: &str, + kb: &KeyBuilder, boost: f32) -> StemmedWordFilter { StemmedWordFilter { iter: iter, keypathword: kb.get_keypathword_only(&stemmed_word), + scorer: Scorer::new(stemmed_word, kb, boost), } } } @@ -116,12 +196,8 @@ impl QueryRuntimeFilter for StemmedWordFilter { } fn next_result(&mut self) -> Result, Error> { - if !self.iter.valid() { - return Ok(None) - } - - let key = match self.iter.next() { - Some((key, _value)) => key, + let (key, value) = match self.iter.next() { + Some((key, value)) => (key, value), None => return Ok(None), }; if !key.starts_with(self.keypathword.as_bytes()) { @@ -131,7 +207,23 @@ impl QueryRuntimeFilter for StemmedWordFilter { // We have a candidate document to return let key_str = unsafe{str::from_utf8_unchecked(&key)}; - Ok(Some(KeyBuilder::parse_doc_result_from_key(&key_str))) + let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); + + if self.scorer.should_score() { + let message_reader = ::capnp::serialize_packed::read_message( + &mut &*value, ::capnp::message::ReaderOptions::new()).unwrap(); + let payload = message_reader.get_root::().unwrap(); + + + self.scorer.add_match_score(try!(payload.get_wordinfos()).len(), + payload.get_total_words(), &mut dr); + } + + Ok(Some(dr)) + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.scorer.init(&mut self.iter, &mut qsi); } } @@ -140,18 +232,20 @@ impl QueryRuntimeFilter for StemmedWordFilter { pub struct StemmedWordPosFilter { iter: DBIterator, keypathword: String, + scorer: Scorer, } impl StemmedWordPosFilter { - pub fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder) -> StemmedWordPosFilter { + pub fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder, boost: f32) -> StemmedWordPosFilter { StemmedWordPosFilter{ iter: iter, keypathword: kb.get_keypathword_only(&stemmed_word), + scorer: Scorer::new(&stemmed_word, &kb, boost), } } fn first_result(&mut self, - start: &DocResult) -> Result)>, Error> { + start: &DocResult) -> Result)>, Error> { KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); // Seek in index to >= entry @@ -163,11 +257,7 @@ impl StemmedWordPosFilter { self.next_result() } - fn 
next_result(&mut self) -> Result)>, Error> { - if !self.iter.valid() { - return Ok(None) - } - + fn next_result(&mut self) -> Result)>, Error> { let (key, value) = match self.iter.next() { Some((key, value)) => (key, value), None => return Ok(None), @@ -176,20 +266,25 @@ impl StemmedWordPosFilter { // we passed the key path we are interested in. nothing left to do */ return Ok(None) } - let mut ref_value = &*value; + let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); + &mut &*value, ::capnp::message::ReaderOptions::new()).unwrap(); let payload = message_reader.get_root::().unwrap(); - let positions = try!(payload.get_wordinfos()) - .iter() - .map(|wi| wi.get_word_pos()as i64) - .collect(); - + let positions: Vec = try!(payload.get_wordinfos()).iter() + .map(|wi| wi.get_word_pos()) + .collect(); + let key_str = unsafe{str::from_utf8_unchecked(&key)}; - let docresult = KeyBuilder::parse_doc_result_from_key(&key_str); + let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); + + self.scorer.add_match_score(positions.len() as u32, payload.get_total_words(), &mut dr); + + Ok(Some((dr, positions))) + } - Ok(Some((docresult, positions))) + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.scorer.init(&mut self.iter, &mut qsi); } } @@ -205,7 +300,7 @@ impl StemmedPhraseFilter { } fn result(&mut self, - base: Option<(DocResult, Vec)>) -> Result, Error> { + base: Option<(DocResult, Vec)>) -> Result, Error> { // this is the number of matches left before all terms match and we can return a result let mut matches_left = self.filters.len() - 1; @@ -277,16 +372,22 @@ impl QueryRuntimeFilter for StemmedPhraseFilter { let base_result = try!(self.filters[0].next_result()); self.result(base_result) } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + for f in self.filters.iter_mut() { + f.prepare_relevancy_scoring(&mut qsi); + } + } } pub struct DistanceFilter { filters: Vec, current_filter: usize, - distance: i64, + distance: u32, } impl DistanceFilter { - pub fn new(filters: Vec, distance: i64) -> DistanceFilter { + pub fn new(filters: Vec, distance: u32) -> DistanceFilter { DistanceFilter { filters: filters, current_filter: 0, @@ -295,7 +396,7 @@ impl DistanceFilter { } fn result(&mut self, - base: Option<(DocResult, Vec)>) -> Result, Error> { + base: Option<(DocResult, Vec)>) -> Result, Error> { // yes this code complex. I tried to break it up, but it wants to be like this. // this is the number of matches left before all terms match and we can return a result @@ -306,7 +407,7 @@ impl DistanceFilter { // This contains tuples of word postions and the filter they came from, // sorted by word position. - let mut base_positions: Vec<(i64, usize)> = positions.iter() + let mut base_positions: Vec<(u32, usize)> = positions.iter() .map(|pos|(*pos, self.current_filter)) .collect(); @@ -324,54 +425,63 @@ impl DistanceFilter { if next.is_none() { return Ok(None); } let (next_result, next_positions) = next.unwrap(); - if base_result == next_result { - // so we are in the same field. Now to check the proximity of the values from the - // next result to previous results. + if base_result != next_result { + // not same field, next_result becomes base_result. + base_result = next_result; + base_positions = next_positions.iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); - // new_positions_map will accept positions within range of pos. 
But only if all - // positions that can be are within range. We use the sorted map so we can add - // the same positions multiple times and it's a noop. - let mut new_positions_map = BTreeMap::new(); - for &pos in next_positions.iter() { - // could these lines be any longer? No they could not. - let start = match base_positions.binary_search_by_key(&(pos-dis), |&(pos2,_)| pos2) { - Ok(start) => start, - Err(start) => start, - }; - - let end = match base_positions.binary_search_by_key(&(pos+dis), + matches_left = self.filters.len() - 1; + continue; + } + // so we are in the same field. Now to check the proximity of the values from the + // next result to previous results. + + // new_positions_map will accept positions within range of pos. But only if all + // positions that can be are within range. We use the sorted map so we can add + // the same positions multiple times and it's a noop. + let mut new_positions_map = BTreeMap::new(); + for &pos in next_positions.iter() { + // could these lines be any longer? No they could not. + let sub = pos.saturating_sub(dis); // underflows otherwise + let start = match base_positions.binary_search_by_key(&(sub), |&(pos2,_)| pos2) { + Ok(start) => start, + Err(start) => start, + }; + + let end = match base_positions.binary_search_by_key(&(pos+dis), |&(pos2,_)| pos2) { - Ok(end) => end, - Err(end) => end, - }; - - // we now collect all the filters within the range - let mut filters_encountered = HashSet::new(); - for &(_, filter_n) in base_positions[start..end].iter() { - filters_encountered.insert(filter_n); - } - - if filters_encountered.len() == self.filters.len() - matches_left { - // we encountered all the filters we can at this stage, - // so we should add them all to the new_positions_map - for &(prev_pos, filter_n) in base_positions[start..end].iter() { - new_positions_map.insert(prev_pos, filter_n); - } - // and add the current pos - new_positions_map.insert(pos, self.current_filter); + Ok(end) => end, + Err(end) => end, + }; + + // we now collect all the filters within the range + let mut filters_encountered = HashSet::new(); + for &(_, filter_n) in base_positions[start..end].iter() { + filters_encountered.insert(filter_n); + } + + if filters_encountered.len() == self.filters.len() - matches_left { + // we encountered all the filters we can at this stage, + // so we should add them all to the new_positions_map + for &(prev_pos, filter_n) in base_positions[start..end].iter() { + new_positions_map.insert(prev_pos, filter_n); } + // and add the current pos + new_positions_map.insert(pos, self.current_filter); } - if new_positions_map.len() > 0 { - // we have values that survive! reassign back to positions - base_positions = new_positions_map.into_iter().collect(); - matches_left -= 1; + } + if new_positions_map.len() > 0 { + // we have values that survive!
reassign back to positions + base_positions = new_positions_map.into_iter().collect(); + matches_left -= 1; - if matches_left == 0 { - return Ok(Some(base_result)); - } else { - continue; - } + if matches_left == 0 { + return Ok(Some(base_result)); + } else { + continue; } } // we didn't match on next_result, so get next_result on current filter @@ -399,6 +509,12 @@ impl QueryRuntimeFilter for DistanceFilter { let base_result = try!(self.filters[self.current_filter].next_result()); self.result(base_result) } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + for f in self.filters.iter_mut() { + f.prepare_relevancy_scoring(&mut qsi); + } + } } @@ -440,7 +556,7 @@ impl<'a> AndFilter<'a> { if base_result == next_result { matches_count -= 1; - base_result.combine_bind_name_results(&mut next_result); + base_result.combine(&mut next_result); if matches_count == 0 { return Ok(Some(base_result)); } @@ -462,6 +578,12 @@ impl<'a> QueryRuntimeFilter for AndFilter<'a> { let base_result = try!(self.filters[self.current_filter].next_result()); self.result(base_result) } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + for f in self.filters.iter_mut() { + f.prepare_relevancy_scoring(&mut qsi); + } + } } /// Used by OrFilter to maintain a already fetched result so we don't refetch when one side isn't @@ -548,7 +670,7 @@ impl<'a> OrFilter<'a> { Some(right) }, Ordering::Equal => { - left.combine_bind_name_results(&mut right); + left.combine(&mut right); self.right.result = Some(right); Some(left) }, @@ -582,6 +704,11 @@ impl<'a> QueryRuntimeFilter for OrFilter<'a> { try!(self.right.prime_next_result()); Ok(self.take_smallest()) } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.left.filter.prepare_relevancy_scoring(&mut qsi); + self.right.filter.prepare_relevancy_scoring(&mut qsi); + } } pub struct BindFilter<'a> { @@ -653,5 +780,47 @@ impl<'a> QueryRuntimeFilter for BindFilter<'a> { Ok(None) } } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.filter.prepare_relevancy_scoring(&mut qsi); + } +} + +pub struct BoostFilter<'a> { + filter: Box, + boost: f32, +} + +impl<'a> BoostFilter<'a> { + pub fn new(filter: Box, boost: f32) -> BoostFilter { + BoostFilter { + filter: filter, + boost: boost, + } + } +} + +impl<'a> QueryRuntimeFilter for BoostFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + if let Some(mut dr) = try!(self.filter.first_result(&start)) { + dr.boost_scores(self.boost); + Ok(Some(dr)) + } else { + Ok(None) + } + } + + fn next_result(&mut self) -> Result, Error> { + if let Some(mut dr) = try!(self.filter.next_result()) { + dr.boost_scores(self.boost); + Ok(Some(dr)) + } else { + Ok(None) + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.filter.prepare_relevancy_scoring(&mut qsi); + } } diff --git a/src/index.rs b/src/index.rs index db7fbf2..4f6cd93 100644 --- a/src/index.rs +++ b/src/index.rs @@ -2,9 +2,12 @@ extern crate rocksdb; use std::collections::HashMap; use std::str; +use std::mem; use records_capnp::header; +use rocksdb::MergeOperands; + use error::Error; use json_shred::{Shredder}; @@ -64,7 +67,6 @@ impl Index { //fn open(&mut self, name: &str, open_options: Option) -> Result { pub fn open(&mut self, name: &str, open_options: Option) -> Result<(), Error> { let mut rocks_options = rocksdb::Options::default(); - println!("still here1"); let rocks = match 
rocksdb::DB::open(&rocks_options, name) { Ok(rocks) => rocks, Err(error) => { @@ -75,6 +77,8 @@ impl Index { rocks_options.create_if_missing(true); rocks_options.set_comparator("noise", Index::compare_keys); + rocks_options.set_merge_operator("noise", Index::sum_merge); + let rocks = try!(rocksdb::DB::open(&rocks_options, name)); @@ -187,6 +191,40 @@ impl Index { } } } + + pub fn convert_bytes_to_u64(bytes: &[u8]) -> u64 { + debug_assert!(bytes.len() == 8); + let mut buffer = [0; 8]; + for (n, b) in bytes.iter().enumerate() { + buffer[n] = *b; + } + unsafe{ mem::transmute(buffer) } + } + + pub fn convert_u64_to_bytes(val: u64) -> [u8; 8] { + unsafe{ mem::transmute(val) } + } + + fn sum_merge(new_key: &[u8], + existing_val: Option<&[u8]>, + operands: &mut MergeOperands) + -> Vec { + if !(new_key[0] as char == 'F' || new_key[0] as char == 'K') { + panic!("unknown key type to merge!"); + } + + let mut count:u64 = if let Some(bytes) = existing_val { + Index::convert_bytes_to_u64(&bytes) + } else { + 0 + }; + + for bytes in operands { + count += Index::convert_bytes_to_u64(&bytes); + } + + Index::convert_u64_to_bytes(count).into_iter().map(|b| *b).collect() + } } diff --git a/src/json_shred.rs b/src/json_shred.rs index bedf7bd..402ab92 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -21,14 +21,14 @@ use stems::Stems; #[derive(Debug, PartialEq)] struct WordInfo { //offset in the text field where the stemmed text starts - word_pos: u64, + word_pos: u32, + + // the start of the suffixText + suffix_offset: u32, // the suffix of the stemmed text. When applied over stemmed, the original // text is returned. suffix_text: String, - - // the start of the suffixText - suffix_offset: u64, } type ArrayOffsets = Vec; @@ -53,7 +53,7 @@ pub trait Indexable { pub struct Shredder { kb: KeyBuilder, // Top-level fields prefixed with an underscore are ignored - ignore_children: u64, + ignore_children: usize, doc_id: String, } @@ -71,20 +71,26 @@ impl Shredder { Result<(), Error> { let stems = Stems::new(text.as_str()); let mut word_to_word_infos = HashMap::new(); + let mut total_words = 0; for stem in stems { let word_infos = word_to_word_infos.entry(stem.stemmed).or_insert(Vec::new()); + total_words += 1; word_infos.push(WordInfo{ - word_pos: stem.word_pos as u64, + word_pos: stem.word_pos, suffix_text: stem.suffix.to_string(), - suffix_offset: stem.suffix_offset as u64, + suffix_offset: stem.suffix_offset, }); } + for (stemmed, word_infos) in word_to_word_infos { let mut message = ::capnp::message::Builder::new_default(); + let count: u32; { - let capn_payload = message.init_root::(); - let mut capn_wordinfos = capn_payload.init_wordinfos(word_infos.len() as u32); + let mut capn_payload = message.init_root::(); + count = word_infos.len() as u32; + capn_payload.set_total_words(total_words); + let mut capn_wordinfos = capn_payload.init_wordinfos(count); for (pos, word_info) in word_infos.iter().enumerate() { let mut capn_wordinfo = capn_wordinfos.borrow().get(pos as u32); capn_wordinfo.set_word_pos(word_info.word_pos); @@ -98,6 +104,14 @@ impl Shredder { let key = self.kb.stemmed_word_key(&stemmed, docseq); try!(batch.put(&key.into_bytes(), &bytes)); + let bytes = unsafe{ transmute::(count as u64) }; + let key = self.kb.keypathword_count_key(&stemmed); + try!(batch.merge(&key.into_bytes(), &bytes)); + + let bytes = unsafe{ transmute::(1) }; + let key = self.kb.keypath_count_key(); + try!(batch.merge(&key.into_bytes(), &bytes)); + } let key = self.kb.value_key(docseq); let mut buffer = 
String::with_capacity(text.len() + 1); @@ -319,8 +333,9 @@ mod tests { use std::str; use records_capnp; use super::{WordInfo}; + use index::{Index, OpenOptions}; - fn wordinfos_from_rocks(rocks: rocksdb::DB) -> Vec<(String, Vec)> { + fn wordinfos_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, Vec)> { let mut result = Vec::new(); for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { if key[0] as char == 'W' { @@ -353,9 +368,15 @@ mod tests { let mut batch = rocksdb::WriteBatch::default(); shredder.shred(json, docseq, &mut batch).unwrap(); - let rocks = rocksdb::DB::open_default("target/tests/test_shred_netsted").unwrap(); + let dbname = "target/tests/test_shred_netsted"; + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + rocks.write(batch).unwrap(); - let result = wordinfos_from_rocks(rocks); + let result = wordinfos_from_rocks(&rocks); let expected = vec![ ("W.some$!array#123,0".to_string(), vec![ @@ -382,9 +403,15 @@ mod tests { let mut batch = rocksdb::WriteBatch::default(); shredder.shred(json, docseq, &mut batch).unwrap(); - let rocks = rocksdb::DB::open_default("target/tests/test_shred_objects").unwrap(); + let dbname = "target/tests/test_shred_objects"; + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + rocks.write(batch).unwrap(); - let result = wordinfos_from_rocks(rocks); + let result = wordinfos_from_rocks(&rocks); println!("result: {:?}", result); let expected = vec![ ("W.A$.B!b1#1234,1".to_string(), vec![ @@ -416,9 +443,16 @@ mod tests { let mut batch = rocksdb::WriteBatch::default(); shredder.shred(json, docseq, &mut batch).unwrap(); - let rocks = rocksdb::DB::open_default("target/tests/test_shred_empty_object").unwrap(); + let dbname = "target/tests/test_shred_empty_object"; + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let rocks = &index.rocks.unwrap(); + rocks.write(batch).unwrap(); - let result = wordinfos_from_rocks(rocks); + let result = wordinfos_from_rocks(&rocks); assert!(result.is_empty()); } } diff --git a/src/key_builder.rs b/src/key_builder.rs index 0f346b1..00823bb 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -33,6 +33,26 @@ impl KeyBuilder { string } + pub fn keypathword_count_key(&self, word: &str) -> String { + let mut string = String::with_capacity(100); + string.push('F'); + for segment in &self.keypath { + string.push_str(&segment); + } + string.push('!'); + string.push_str(word); + string + } + + pub fn keypath_count_key(&self) -> String { + let mut string = String::with_capacity(100); + string.push('K'); + for segment in &self.keypath { + string.push_str(&segment); + } + string + } + /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath /// built up internally. 
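/// For example, the stemmed word "array" under keypath `.some$` at array index 0 in doc
/// seq 123 produces the key `W.some$!array#123,0` (see the json_shred tests above).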
pub fn stemmed_word_key(&self, word: &str, seq: u64) -> String { diff --git a/src/parser.rs b/src/parser.rs index 9e715f1..e0ab8db 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,4 +1,5 @@ +use std; use std::str; use std::collections::HashMap; use std::iter::Iterator; @@ -8,10 +9,10 @@ use error::Error; use key_builder::KeyBuilder; use stems::Stems; use json_value::JsonValue; -use query::{Sort, Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, AggregateFun, - SortInfo}; +use query::{Sort, Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore, + AggregateFun, SortInfo, SortField}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, - StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter}; + StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs @@ -23,6 +24,7 @@ pub struct Parser<'a> { offset: usize, kb: KeyBuilder, pub snapshot: Snapshot<'a>, + pub needs_scoring: bool, } impl<'a> Parser<'a> { @@ -32,6 +34,7 @@ impl<'a> Parser<'a> { offset: 0, kb: KeyBuilder::new(), snapshot: snapshot, + needs_scoring: false, } } @@ -254,6 +257,29 @@ impl<'a> Parser<'a> { Ok(Some(kb)) } + // if no boost is specified returns 1.0 + fn consume_boost(&mut self) -> Result { + if self.consume("^") { + if let Some(num) = try!(self.consume_number()) { + Ok(num as f32) + } else { + return Err(Error::Parse("Expected number after ^ symbol.".to_string())); + } + } else { + Ok(1.0) + } + } + + fn consume_boost_and_wrap_filter(&mut self, filter: Box) + -> Result, Error> { + let boost = try!(self.consume_boost()); + if boost != 1.0 { + Ok(Box::new(BoostFilter::new(filter, boost))) + } else { + Ok(filter) + } + } + fn consume_number(&mut self) -> Result, Error> { // Yes this parsing code is hideously verbose. But it conforms exactly to the json spec // and uses the rust f64 parser, which can't tell us how many characters it used or needs. 
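// (Note: consume_boost above funnels through consume_number, so a boost accepts any
// JSON number syntax, e.g. `^2.0` or `^0.5`. At runtime the boost either feeds the
// term's Scorer directly or, for `{...}^2.0`, wraps the filter in a BoostFilter that
// multiplies every accumulated term score via boost_scores.)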
@@ -544,9 +570,11 @@ ws1 fn object<'b>(&'b mut self) -> Result, Error> { if self.consume("{") { - let left = try!(self.obool()); + let mut left = try!(self.obool()); try!(self.must_consume("}")); + left = try!(self.consume_boost_and_wrap_filter(left)); + if self.consume("&&") { let right = try!(self.object()); Ok(Box::new(AndFilter::new(vec![left, right], self.kb.arraypath_len()))) @@ -566,7 +594,8 @@ ws1 try!(self.must_consume("(")); let filter = try!(self.object()); try!(self.must_consume(")")); - Ok(filter) + + self.consume_boost_and_wrap_filter(filter) } fn obool<'b>(&'b mut self) -> Result, Error> { @@ -605,32 +634,39 @@ ws1 } fn oparens<'b>(&'b mut self) -> Result>, Error> { - if self.consume("(") { + let opt_filter = if self.consume("(") { let f = try!(self.obool()); try!(self.must_consume(")")); - Ok(Some(f)) + Some(f) } else if self.could_consume("[") { - Ok(Some(try!(self.array()))) + Some(try!(self.array())) } else if self.could_consume("{") { - Ok(Some(try!(self.object()))) + Some(try!(self.object())) } else { if let Some(filter) = try!(self.bind_var()) { - Ok(Some(filter)) + Some(filter) } else { - Ok(None) + None } + }; + + if let Some(filter) = opt_filter { + Ok(Some(try!(self.consume_boost_and_wrap_filter(filter)))) + } else { + Ok(None) } } fn compare<'b>(&'b mut self) -> Result, Error> { if self.consume("==") { let literal = try!(self.must_consume_string_literal()); + let boost = try!(self.consume_boost()); let stems = Stems::new(&literal); let mut filters: Vec> = Vec::new(); for stem in stems { let iter = self.snapshot.iterator(IteratorMode::Start); let filter = Box::new(ExactMatchFilter::new( - iter, &stem, &self.kb)); + iter, &stem, &self.kb, boost)); filters.push(filter); } match filters.len() { @@ -641,6 +677,7 @@ ws1 } else if self.consume("~=") { // regular search let literal = try!(self.must_consume_string_literal()); + let boost = try!(self.consume_boost()); let stems = Stems::new(&literal); let stemmed_words: Vec = stems.map(|stem| stem.stemmed).collect(); @@ -648,13 +685,14 @@ ws1 0 => panic!("Cannot create a StemmedWordFilter"), 1 => { let iter = self.snapshot.iterator(IteratorMode::Start); - Ok(Box::new(StemmedWordFilter::new(iter, &stemmed_words[0], &self.kb))) + Ok(Box::new(StemmedWordFilter::new(iter, &stemmed_words[0], &self.kb, boost))) }, _ => { let mut filters: Vec = Vec::new(); for stemmed_word in stemmed_words { let iter = self.snapshot.iterator(IteratorMode::Start); - let filter = StemmedWordPosFilter::new(iter, &stemmed_word, &self.kb); + let filter = StemmedWordPosFilter::new(iter, &stemmed_word, + &self.kb, boost); filters.push(filter); } Ok(Box::new(StemmedPhraseFilter::new(filters))) @@ -670,17 +708,21 @@ ws1 try!(self.must_consume("=")); let literal = try!(self.must_consume_string_literal()); + let boost = try!(self.consume_boost()); let stems = Stems::new(&literal); let mut filters: Vec = Vec::new(); for stem in stems { let iter = self.snapshot.iterator(IteratorMode::Start); let filter = StemmedWordPosFilter::new( - iter, &stem.stemmed, &self.kb); + iter, &stem.stemmed, &self.kb, boost); filters.push(filter); } + if word_distance > std::u32::MAX as i64 { + return Err(Error::Parse("Proximity search number too large.".to_string())); + } match filters.len() { 0 => panic!("Cannot create a DistanceFilter"), - _ => Ok(Box::new(DistanceFilter::new(filters, word_distance))), + _ => Ok(Box::new(DistanceFilter::new(filters, word_distance as u32))), } } else { Err(Error::Parse("Expected comparison operator".to_string())) @@ -712,20 +754,26 @@ ws1 } fn 
aparens<'b>(&'b mut self) -> Result>, Error> { - if self.consume("(") { + let opt_filter = if self.consume("(") { let f = try!(self.abool()); try!(self.must_consume(")")); - Ok(Some(f)) + Some(f) } else if self.could_consume("[") { - Ok(Some(try!(self.array()))) + Some(try!(self.array())) } else if self.could_consume("{") { - Ok(Some(try!(self.object()))) + Some(try!(self.object())) } else { if let Some(filter) = try!(self.bind_var()) { - Ok(Some(filter)) + Some(filter) } else { - Ok(None) + None } + }; + + if let Some(filter) = opt_filter { + Ok(Some(try!(self.consume_boost_and_wrap_filter(filter)))) + } else { + Ok(None) } } @@ -753,7 +801,8 @@ ws1 let filter = try!(self.abool()); self.kb.pop_array(); try!(self.must_consume("]")); - Ok(filter) + + self.consume_boost_and_wrap_filter(filter) } pub fn sort_clause(&mut self) -> Result, Error> { @@ -790,12 +839,31 @@ ws1 sort }; - sort_infos.insert(kb.value_key(0), SortInfo{kb:kb, - sort:sort, - default:default}); - if !self.consume(",") { - break; - } + sort_infos.insert(kb.value_key(0), SortInfo{field: SortField::FetchValue(kb), + sort: sort, + default: default}); + } else { + try!(self.must_consume("score")); + try!(self.must_consume("(")); + try!(self.must_consume(")")); + + self.needs_scoring = true; + + let sort = if self.consume("asc") { + Sort::Asc + } else if self.consume("desc") { + Sort::Desc + } else { + Sort::Asc + }; + + sort_infos.insert("score()".to_string(), + SortInfo{field: SortField::Score, + sort: sort, default: JsonValue::Null}); + } + + if !self.consume(",") { + break; } } if sort_infos.is_empty() { @@ -867,6 +935,17 @@ ws1 return Ok(Some(Box::new(RetLiteral{json: JsonValue::False}))); } else if self.consume("null") { return Ok(Some(Box::new(RetLiteral{json: JsonValue::Null}))); + } else if self.could_consume("score") { + let offset = self.offset; + let _ = self.consume("score"); + if self.consume("(") { + try!(self.must_consume(")")); + self.needs_scoring = true; + return Ok(Some(Box::new(RetScore{sort: None}))); + } else { + //wasn't the score, maybe it's a bind variable + self.offset = offset; + } } if let Some((ag, bind_name_option, kb, json)) = try!(self.consume_aggregate()) { @@ -1007,6 +1086,17 @@ ws1 self.ws(); Ok(try!(self.find())) } + + pub fn non_ws_left(&mut self) -> Result<(), Error> { + self.ws(); + if self.offset != self.query.len() { + Err(Error::Parse(format!("At character {} unexpected {}.", + self.offset, + &self.query[self.offset..]))) + } else { + Ok(()) + } + } } #[cfg(test)] diff --git a/src/query.rs b/src/query.rs index b1de97d..3c5f3ff 100644 --- a/src/query.rs +++ b/src/query.rs @@ -25,6 +25,7 @@ pub struct DocResult { pub seq: u64, pub arraypath: Vec, pub bind_name_result: HashMap>, + pub scores: Vec<(f32, usize)>, // (sum of score, num matches of term) } impl DocResult { @@ -33,6 +34,7 @@ impl DocResult { seq: 0, arraypath: Vec::new(), bind_name_result: HashMap::new(), + scores: Vec::new(), } } @@ -44,7 +46,7 @@ impl DocResult { self.bind_name_result.insert(bind_name.to_string(), vec![result_key]); } - pub fn combine_bind_name_results(&mut self, other: &mut DocResult) { + pub fn combine(&mut self, other: &mut DocResult) { let mut replace = HashMap::new(); swap(&mut replace, &mut other.bind_name_result); for (bind_name, mut result_keys_other) in replace.into_iter() { @@ -54,6 +56,21 @@ impl DocResult { } self.bind_name_result.insert(bind_name, result_keys_other); } + self.scores.append(&mut other.scores); + } + + pub fn add_score(&mut self, term_ordinal: usize, score: f32) { + if term_ordinal 
>= self.scores.len() { + self.scores.resize(term_ordinal + 1, (0.0, 0)); + } + self.scores[term_ordinal].0 += score; + self.scores[term_ordinal].1 += 1; + } + + pub fn boost_scores(&mut self, boost: f32) { + for &mut (ref mut score, ref mut _num_match) in self.scores.iter_mut() { + *score *= boost; + } } } @@ -85,123 +102,89 @@ impl Ord for DocResult { } } +pub struct QueryScoringInfo { + pub num_terms: usize, + pub sum_of_idt_sqs: f32, +} pub struct Query {} impl Query { pub fn get_matches<'a>(query: String, index: &'a Index) -> Result, Error> { - match index.rocks { - Some(ref rocks) => { - let snapshot = Snapshot::new(&rocks); - let mut parser = Parser::new(query, snapshot); - let filter = try!(parser.build_filter()); - let mut sorts = try!(parser.sort_clause()); - let mut returnable = try!(parser.return_clause()); - let limit = try!(parser.limit_clause()); - - let mut ags = Vec::new(); - returnable.get_aggregate_funs(&mut ags); - - let mut has_ags = false; - for option_ag in ags.iter() { - if option_ag.is_some() { - has_ags = true; - break; + if index.rocks.is_none() { + return Err(Error::Parse("You must open the index first".to_string())); + } + + let snapshot = Snapshot::new(&index.rocks.as_ref().unwrap()); + let mut parser = Parser::new(query, snapshot); + let mut filter = try!(parser.build_filter()); + let mut sorts = try!(parser.sort_clause()); + let mut returnable = try!(parser.return_clause()); + let limit = try!(parser.limit_clause()); + try!(parser.non_ws_left()); + + let mut ags = Vec::new(); + returnable.get_aggregate_funs(&mut ags); + + let mut has_ags = false; + for option_ag in ags.iter() { + if option_ag.is_some() { + has_ags = true; + break; + } + } + let has_sorting = !sorts.is_empty(); + + returnable = if has_sorting && has_ags { + return Err(Error::Parse("Cannot have aggregates and sorting in the same query" + .to_string())); + } else if has_sorting { + returnable.take_sort_for_matching_fields(&mut sorts); + if !sorts.is_empty() { + let mut vec: Vec> = Vec::new(); + for (_key, sort_info) in sorts.into_iter() { + match sort_info.field { + SortField::FetchValue(kb) => { + vec.push(Box::new(RetValue{ kb: kb, + ag: None, + default: sort_info.default, + sort: Some(sort_info.sort)})); + }, + SortField::Score => { + vec.push(Box::new(RetScore{ sort: Some(sort_info.sort)})); + }, } } - let has_sorting = !sorts.is_empty(); - - returnable = if has_sorting && has_ags { - return Err(Error::Parse("Cannot have aggregates and sorting in the same query" - .to_string())); - } else if has_sorting { - returnable.take_sort_for_matching_fields(&mut sorts); - if !sorts.is_empty() { - let vec = sorts.into_iter() - .map(|(_key, sort_info)| - RetValue {kb: sort_info.kb, - ag: None, - default: sort_info.default, - sort: Some(sort_info.sort)}) - .collect(); - Box::new(RetHidden{unrendered: vec, visible: returnable}) - } else { - returnable - } - } else { - returnable - }; - - let option_ags = if has_ags { - // we have at least one AggregationFun. Make sure they are all set. 
- for option_ag in ags.iter() { - if option_ag.is_none() { - return Err(Error::Parse("Return keypaths must either all have \ - aggregate functions, or none can have them.".to_string())); - } - } - Some(ags.into_iter().map(|option| option.unwrap()).collect()) - } else { - None - }; + Box::new(RetHidden{unrendered: vec, visible: returnable}) + } else { + returnable + } + } else { + returnable + }; - let sorting = if has_sorting { - let mut sorting = Vec::new(); - returnable.get_sorting(&mut sorting); - Some(sorting) - } else { - None - }; - - Ok(QueryResults::new(filter, parser.snapshot, returnable, - option_ags, sorting, limit)) - }, - None => { - Err(Error::Parse("You must open the index first".to_string())) - }, + if has_ags { + // we have at least one AggregationFun. Make sure they are all set. + for option_ag in ags.iter() { + if option_ag.is_none() { + return Err(Error::Parse("Return keypaths must either all have \ + aggregate functions, or none can have them.".to_string())); + } + } } - } -} -pub struct QueryResults<'a> { - filter: Box, - doc_result_next: DocResult, - snapshot: Snapshot<'a>, - iter: DBIterator, - returnable: Box, - buffer: Vec, - needs_sorting_and_ags: bool, - done_with_sorting_and_ags: bool, - does_group_or_aggr: bool, - sorts: Option>, - aggr_inits: Vec<(fn (&mut JsonValue), usize)>, - aggr_actions: Vec<(fn (&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, - aggr_finals: Vec<(fn (&mut JsonValue), usize)>, - in_buffer: Vec>, - sorted_buffer: Vec>, - limit: usize, -} + let needs_sorting_and_ags = has_ags || has_sorting; -impl<'a> QueryResults<'a> { - fn new(filter: Box, - snapshot: Snapshot<'a>, - returnable: Box, - ags: Option>, - sorting: Option>>, - limit: usize) -> QueryResults<'a> { - // the input args for sorts and ags are vecs where the slot is the same slot as // a result that the action needs to be applied to. We instead convert them // into several new fields with tuples of action and the slot to act on.
// this way we don't needlessly loop over the actions where most are noops - // only one can be Some at a time - debug_assert!(!sorting.is_some() && !ags.is_some() || sorting.is_some() ^ ags.is_some()); - let needs_sorting_and_ags = ags.is_some() || sorting.is_some(); - let mut sorts = Vec::new(); - if let Some(mut sorting) = sorting { + if has_sorting { + let mut sorting = Vec::new(); + returnable.get_sorting(&mut sorting); let mut n = sorting.len(); while let Some(option) = sorting.pop() { n -= 1; @@ -212,14 +195,16 @@ impl<'a> QueryResults<'a> { // order we process sorts is important sorts.reverse(); } + + let mut does_group_or_aggr = false; let mut aggr_inits = Vec::new(); let mut aggr_actions = Vec::new(); let mut aggr_finals = Vec::new(); - if let Some(mut ags) = ags { + if has_ags { does_group_or_aggr = true; let mut n = ags.len(); - while let Some((ag, user_arg)) = ags.pop() { + while let Some(Some((ag, user_arg))) = ags.pop() { n -= 1; if ag == AggregateFun::GroupAsc { sorts.push((Sort::Asc, n)); @@ -240,11 +225,23 @@ impl<'a> QueryResults<'a> { sorts.reverse(); } - QueryResults{ + let mut qsi = QueryScoringInfo{num_terms: 0, sum_of_idt_sqs: 0.0}; + + if parser.needs_scoring { + filter.prepare_relevancy_scoring(&mut qsi); + } + + let query_norm = if qsi.num_terms > 0 { + 1.0/(qsi.sum_of_idt_sqs as f32) + } else { + 0.0 + }; + + Ok(QueryResults { filter: filter, doc_result_next: DocResult::new(), - iter: snapshot.iterator(IteratorMode::Start), - snapshot: snapshot, + iter: parser.snapshot.iterator(IteratorMode::Start), + snapshot: parser.snapshot, returnable: returnable, buffer: Vec::new(), needs_sorting_and_ags: needs_sorting_and_ags, @@ -257,7 +254,49 @@ impl<'a> QueryResults<'a> { in_buffer: Vec::new(), sorted_buffer: Vec::new(), limit: limit, + scoring_num_terms: qsi.num_terms, + scoring_query_norm: query_norm, + }) + } +} + +pub struct QueryResults<'a> { + filter: Box, + doc_result_next: DocResult, + snapshot: Snapshot<'a>, + iter: DBIterator, + returnable: Box, + buffer: Vec, + needs_sorting_and_ags: bool, + done_with_sorting_and_ags: bool, + does_group_or_aggr: bool, + sorts: Option>, + aggr_inits: Vec<(fn (&mut JsonValue), usize)>, + aggr_actions: Vec<(fn (&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, + aggr_finals: Vec<(fn (&mut JsonValue), usize)>, + in_buffer: Vec>, + sorted_buffer: Vec>, + limit: usize, + scoring_num_terms: usize, + scoring_query_norm: f32, +} + +impl<'a> QueryResults<'a> { + + fn compute_relevancy_score(& self, dr: &DocResult) -> f32 { + if self.scoring_num_terms == 0 { + return 0.0 + } + let mut num_terms_matched = 0; + let mut score: f32 = 0.0; + for &(ref total_term_score, ref num_times_term_matched) in dr.scores.iter() { + if *num_times_term_matched > 0 { + score += *total_term_score/(*num_times_term_matched as f32); + num_terms_matched += 1; + } } + self.scoring_query_norm * score * (num_terms_matched as f32) / (self.scoring_num_terms as f32) } fn get_next_result(&mut self) -> Result, Error> { @@ -307,8 +346,9 @@ impl<'a> QueryResults<'a> { }; match next { Some(dr) => { + let score = self.compute_relevancy_score(&dr); let mut results = VecDeque::new(); - try!(self.returnable.fetch_result(&mut self.iter, dr.seq, + try!(self.returnable.fetch_result(&mut self.iter, dr.seq, score, &dr.bind_name_result, &mut results)); self.in_buffer.push(results); if self.in_buffer.len() == self.limit { @@ -344,8 +384,9 @@ impl<'a> QueryResults<'a> { Some(dr) => dr, None => return Ok(None), }; + let score = self.compute_relevancy_score(&dr); let mut
results = VecDeque::new(); - try!(self.returnable.fetch_result(&mut self.iter, dr.seq, + try!(self.returnable.fetch_result(&mut self.iter, dr.seq, score, &dr.bind_name_result, &mut results)); self.buffer.clear(); try!(self.returnable.write_result(&mut results, &mut self.buffer)); @@ -732,15 +773,20 @@ pub enum Sort { Desc, } +pub enum SortField { + FetchValue(KeyBuilder), + Score, +} + pub struct SortInfo { - pub kb: KeyBuilder, + pub field: SortField, pub sort: Sort, pub default: JsonValue, } pub trait Returnable { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error>; @@ -759,11 +805,11 @@ pub struct RetObject { } impl Returnable for RetObject { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { for &(ref _key, ref field) in self.fields.iter() { - try!(field.fetch_result(iter, seq, bind_var_keys, result)); + try!(field.fetch_result(iter, seq, score, bind_var_keys, result)); } Ok(()) } @@ -814,11 +860,11 @@ pub struct RetArray { } impl Returnable for RetArray { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { for ref slot in self.slots.iter() { - try!(slot.fetch_result(iter, seq, bind_var_keys, result)); + try!(slot.fetch_result(iter, seq, score, bind_var_keys, result)); } Ok(()) } @@ -863,19 +909,19 @@ impl Returnable for RetArray { pub struct RetHidden { - unrendered: Vec, + unrendered: Vec>, visible: Box, } impl Returnable for RetHidden { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { for ref mut unrendered in self.unrendered.iter() { - try!(unrendered.fetch_result(iter, seq, bind_var_keys, result)); + try!(unrendered.fetch_result(iter, seq, score, bind_var_keys, result)); } - self.visible.fetch_result(iter, seq, bind_var_keys, result) + self.visible.fetch_result(iter, seq, score, bind_var_keys, result) } fn get_aggregate_funs(&self, funs: &mut Vec>) { @@ -909,7 +955,7 @@ pub struct RetLiteral { } impl Returnable for RetLiteral { - fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, + fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, _score: f32, _bind_var_keys: &HashMap>, _result: &mut VecDeque) -> Result<(), Error> { Ok(()) @@ -1077,7 +1123,7 @@ impl RetValue { } impl Returnable for RetValue { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, _bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { @@ -1146,7 +1192,7 @@ pub struct RetBind { impl Returnable for RetBind { - fn fetch_result(&self, iter: &mut DBIterator, _seq: u64, + fn fetch_result(&self, iter: &mut DBIterator, _seq: u64, _score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { @@ -1207,6 +1253,44 @@ impl Returnable for RetBind { } +pub struct RetScore { + pub sort: Option, +} + + +impl Returnable for RetScore { + fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, score: f32, + _bind_var_keys: &HashMap>, + result: &mut VecDeque) -> 
Result<(), Error> { + result.push_back(JsonValue::Number(score as f64)); + Ok(()) + } + + fn get_aggregate_funs(&self, _funs: &mut Vec>) { + // noop + } + + fn get_sorting(&self, sorts: &mut Vec>) { + sorts.push(self.sort.clone()); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + if let Some(sort_info) = map.remove("score()") { + self.sort = Some(sort_info.sort); + } + } + + fn write_result(&self, results: &mut VecDeque, + write: &mut Write) -> Result<(), Error> { + if let Some(json) = results.pop_front() { + try!(json.render(write)); + } else { + panic!("missing result!"); + } + Ok(()) + } +} + #[cfg(test)] mod tests { extern crate rustc_serialize; @@ -1331,6 +1415,10 @@ mod tests { assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); + query_results = Query::get_matches(r#"find {A: ~10= "a bunch of words sentence"}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + query_results = Query::get_matches(r#"find {A: == ""}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("11".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); @@ -1773,6 +1861,171 @@ mod tests { } } + #[test] + fn test_query_score() { + let dbname = "target/tests/querytestscore"; + + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + assert_eq!(Ok(()), index.add(r#"{"_id":"1", "bar": "fox"}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"2", "bar": "quick fox"}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"3", "bar": "quick brown fox"}"#)); + + index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} + sort score() desc + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""3""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""2""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + + { + let mut query_results = Query::get_matches(r#"find {bar: ~="quick brown fox"} + sort score() desc + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""3""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + + { + //score boosting + let mut query_results = Query::get_matches(r#"find {bar: ~="quick brown fox"} + return score() "#.to_string(), &index).unwrap(); + + + let mut query_results2 = Query::get_matches(r#"find {bar: ~="quick brown fox"^2} + return score() "#.to_string(), &index).unwrap(); + + + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: =="quick brown fox"} + return score() "#.to_string(), &index).unwrap(); + + + let mut query_results2 = Query::get_matches(r#"find {bar: =="quick brown fox"^2} + return score() "#.to_string(), &index).unwrap(); + + + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: 
~2="quick brown fox"} + return score() "#.to_string(), &index).unwrap(); + + + let mut query_results2 = Query::get_matches(r#"find {bar: ~2="quick brown fox"^2} + return score() "#.to_string(), &index).unwrap(); + + + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + let mut query_results2 = Query::get_matches(r#"find ({bar: ~="fox" || bar: ~="brown" || bar: ~="quick"})^2 + sort score() desc + return score() "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + let mut query_results2 = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"}^2 + sort score() desc + return score() "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + let mut query_results2 = Query::get_matches(r#"find {bar: ~="fox"^2 || (bar: ~="brown" || bar: ~="quick")^2 } + sort score() desc + return score() "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + let mut query_results2 = Query::get_matches(r#"find {bar: ~="fox"}^2 || {bar: ~="brown" || bar: ~="quick"}^2 + sort score() desc + return score() "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + assert_eq!(Ok(()), index.add(r#"{"_id":"4", "bar": ["fox"]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"5", "bar": ["quick fox"]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"6", "bar": ["quick brown fox"]}"#)); + + index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {bar:[ ~="fox" || ~="brown" || ~="quick"]} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + let mut query_results2 = Query::get_matches(r#"find {bar:[~="fox" || ~="brown" || ~="quick"]^2} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + { + let mut query_results = Query::get_matches(r#"find {bar:[ ~="fox" || ~="brown" || ~="quick"]} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + let mut query_results2 = Query::get_matches(r#"find {bar:[~="fox"]^2 || bar:[~="brown" || ~="quick"]^2} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + 
assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + { + let mut query_results = Query::get_matches(r#"find {bar:[ ~="fox" || ~="brown" || ~="quick"]} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + let mut query_results2 = Query::get_matches(r#"find {bar:[~="fox"]^2 || (bar:[~="brown"] || bar:[~="quick"])^2} + sort score() desc + return score() "#.to_string(), &index).unwrap(); + assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, + query_results2.next_result().unwrap().unwrap().parse::().unwrap()); + } + + } #[test] fn test_query_more_docs() { diff --git a/src/stems.rs b/src/stems.rs index 94a867a..48a37ac 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -18,9 +18,9 @@ pub struct Stems<'a> { #[derive(Debug, PartialEq)] pub struct StemmedWord { // Where the stemmed word starts - pub word_pos: usize, + pub word_pos: u32, // Where the suffix starts - pub suffix_offset: usize, + pub suffix_offset: u32, // The stemmed word pub stemmed: String, // The difference between the stemmed word and the original lowercased one. It can be @@ -96,7 +96,7 @@ impl<'a> Iterator for Stems<'a> { self.word_position += 1; return Some(StemmedWord { word_pos: 0, - suffix_offset: word_to_stem.len(), + suffix_offset: word_to_stem.len() as u32, stemmed: word_to_stem, suffix: String::new(), }); @@ -123,8 +123,8 @@ impl<'a> Iterator for Stems<'a> { let stemmed = self.stemmer.stem(&word_to_stem.to_lowercase()); let prefix_len = Stems::common_prefix_len(&stemmed, &suffix); let ret = StemmedWord { - word_pos: self.word_position, - suffix_offset: prefix_len, + word_pos: self.word_position as u32, + suffix_offset: prefix_len as u32, stemmed: stemmed, suffix: (&suffix[prefix_len..]).to_string(), }; From c33d06b84678c6bb2749a5dbbf24faf24cce8e06 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 12 Jan 2017 18:29:51 -0800 Subject: [PATCH 054/122] Logical Not (!) support Use ! in front of comparison operators, arrays, objects or parens to negate match clauses. Query must not be made up of only negated clauses, and queries cannot have negated clauses nested in other negations. 
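A usage sketch of the new syntax (illustrative only, not part of the diff below; it mirrors the tests added in this patch and assumes `index` is an already opened and populated Index):

    // Match docs whose "bar" field stems to "fox" but not to "quick".
    // The ! can prefix a comparison, parens, an array or a whole object;
    // at least one clause in the query must stay non-negated.
    let mut results = Query::get_matches(
        r#"find {bar: ~="fox"} && !{bar: ~="quick"}
           return ._id"#.to_string(), &index).unwrap();
    while let Some(id) = results.next_result().unwrap() {
        println!("{}", id);
    }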
--- Cargo.toml | 2 +- src/filters.rs | 154 ++++++++++++++++++++++++++++++- src/key_builder.rs | 56 +++++++++--- src/parser.rs | 41 ++++++++- src/query.rs | 219 +++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 435 insertions(+), 37 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 36b4bc3..fb39757 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ stemmer = "0.3.2" unicode-normalization = "0.1.2" unicode-segmentation = "0.1.2" rocksdb = "0.5.0" - +backtrace = "0.2.0" [build-dependencies] capnpc = "0.7.2" diff --git a/src/filters.rs b/src/filters.rs index 0f19f40..db38c24 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -87,6 +87,12 @@ pub trait QueryRuntimeFilter { fn first_result(&mut self, start: &DocResult) -> Result, Error>; fn next_result(&mut self) -> Result, Error>; fn prepare_relevancy_scoring(&mut self, qsi: &mut QueryScoringInfo); + + /// returns an error if a double negation is detected + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error>; + + /// returns true if the filter or all of its subfilters are NotFilters + fn is_all_not(&self) -> bool; } pub struct ExactMatchFilter { @@ -163,6 +169,14 @@ impl QueryRuntimeFilter for ExactMatchFilter { fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { self.scorer.init(&mut self.iter, &mut qsi); } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } } pub struct StemmedWordFilter { @@ -225,6 +239,14 @@ impl QueryRuntimeFilter for StemmedWordFilter { fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { self.scorer.init(&mut self.iter, &mut qsi); } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } } /// This is not a QueryRuntimeFilter but it imitates one.
Instead of returning just a DocResult @@ -378,6 +400,14 @@ impl QueryRuntimeFilter for StemmedPhraseFilter { f.prepare_relevancy_scoring(&mut qsi); } } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } } pub struct DistanceFilter { @@ -515,6 +545,14 @@ impl QueryRuntimeFilter for DistanceFilter { f.prepare_relevancy_scoring(&mut qsi); } } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } } @@ -584,6 +622,22 @@ impl<'a> QueryRuntimeFilter for AndFilter<'a> { f.prepare_relevancy_scoring(&mut qsi); } } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + for f in self.filters.iter() { + try!(f.check_double_not(parent_is_neg)); + } + Ok(()) + } + + fn is_all_not(&self) -> bool { + for f in self.filters.iter() { + if !f.is_all_not() { + return false; + } + } + true + } } /// Used by OrFilter to maintain a already fetched result so we don't refetch when one side isn't @@ -603,7 +657,7 @@ impl<'a> FilterWithResult<'a> { } if self.result.is_none() { self.result = try!(self.filter.first_result(start)); - } else if self.result.as_ref().unwrap() < start { + } else if self.result.as_ref().unwrap().less(start, self.array_depth) { self.result = try!(self.filter.first_result(start)); } if self.result.is_none() { @@ -650,7 +704,7 @@ impl<'a> OrFilter<'a> { result: None, array_depth: array_depth, is_done: false, - } + }, } } fn take_smallest(&mut self) -> Option { @@ -709,11 +763,86 @@ impl<'a> QueryRuntimeFilter for OrFilter<'a> { self.left.filter.prepare_relevancy_scoring(&mut qsi); self.right.filter.prepare_relevancy_scoring(&mut qsi); } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + try!(self.left.filter.check_double_not(parent_is_neg)); + try!(self.right.filter.check_double_not(parent_is_neg)); + Ok(()) + } + + fn is_all_not(&self) -> bool { + if self.left.filter.is_all_not() && self.right.filter.is_all_not() { + true + } else { + false + } + } +} + + +pub struct NotFilter<'a> { + filter: Box, + last_doc_returned: Option, + array_depth: usize, +} + +impl<'a> NotFilter<'a> { + pub fn new(filter: Box, array_depth: usize) -> NotFilter { + NotFilter { + filter: filter, + last_doc_returned: Some(DocResult::new()), + array_depth: array_depth, + } + } +} + +impl<'a> QueryRuntimeFilter for NotFilter<'a> { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + let mut start = start.clone_only_seq_and_arraypath(); + while let Some(dr) = try!(self.filter.first_result(&start)) { + if start.less(&dr, self.array_depth) { + self.last_doc_returned = Some(start.clone_only_seq_and_arraypath()); + return Ok(Some(start.clone_only_seq_and_arraypath())); + } + start.increment_last(self.array_depth); + } + self.last_doc_returned = None; + Ok(Some(start)) + } + + fn next_result(&mut self) -> Result, Error> { + let next = if let Some(ref last_doc_returned) = self.last_doc_returned { + let mut next = last_doc_returned.clone_only_seq_and_arraypath(); + next.increment_last(self.array_depth); + next + } else { + return Ok(None); + }; + self.first_result(&next) + } + + fn prepare_relevancy_scoring(&mut self, _qsi: &mut QueryScoringInfo) { + // no op + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + if parent_is_neg { + return Err(Error::Parse("Logical not (\"!\") is nested inside of another logical not. 
\ + This is not allowed.".to_string())); + } + try!(self.filter.check_double_not(true)); + Ok(()) + } + + fn is_all_not(&self) -> bool { + true + } } pub struct BindFilter<'a> { bind_var_name: String, filter: Box, + array_depth: usize, kb: KeyBuilder, option_next: Option, } @@ -725,7 +854,8 @@ impl<'a> BindFilter<'a> { kb: KeyBuilder) -> BindFilter { BindFilter { bind_var_name: bind_var_name, - filter: filter, + filter: filter, + array_depth: kb.arraypath_len(), kb: kb, option_next: None, } @@ -751,7 +881,7 @@ impl<'a> BindFilter<'a> { impl<'a> QueryRuntimeFilter for BindFilter<'a> { fn first_result(&mut self, start: &DocResult) -> Result, Error> { let first = if let Some(next) = self.option_next.take() { - if next >= *start { + if start.less(&next, self.array_depth) { Some(next) } else { try!(self.filter.first_result(&start)) @@ -784,6 +914,14 @@ impl<'a> QueryRuntimeFilter for BindFilter<'a> { fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { self.filter.prepare_relevancy_scoring(&mut qsi); } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + self.filter.check_double_not(parent_is_neg) + } + + fn is_all_not(&self) -> bool { + self.filter.is_all_not() + } } pub struct BoostFilter<'a> { @@ -822,5 +960,13 @@ impl<'a> QueryRuntimeFilter for BoostFilter<'a> { fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { self.filter.prepare_relevancy_scoring(&mut qsi); } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + self.filter.check_double_not(parent_is_neg) + } + + fn is_all_not(&self) -> bool { + self.filter.is_all_not() + } } diff --git a/src/key_builder.rs b/src/key_builder.rs index 00823bb..09a4081 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -262,7 +262,8 @@ impl KeyBuilder { /* parses a seq and array path portion (ex "123,0,0,10) of a key into a doc result */ pub fn parse_doc_result_from_key(str: &str) -> DocResult { let mut dr = DocResult::new(); - let (_path_str, seq_str, arraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&str); + let (_path_str, seq_str, arraypath_str) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&str); dr.seq = seq_str.parse().unwrap(); if !arraypath_str.is_empty() { for numstr in arraypath_str.split(",") { @@ -274,15 +275,17 @@ impl KeyBuilder { pub fn compare_keys(akey: &str, bkey: &str) -> i32 { use std::cmp::Ordering; - assert!(akey.starts_with('W')); - assert!(bkey.starts_with('W')); - let (apath_str, aseq_str, aarraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); - let (bpath_str, bseq_str, barraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); + debug_assert!(akey.starts_with('W')); + debug_assert!(bkey.starts_with('W')); + let (apath_str, aseq_str, aarraypath_str) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); + let (bpath_str, bseq_str, barraypath_str) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); match apath_str[1..].cmp(&bpath_str[1..]) { - Ordering::Less => -1, - Ordering::Greater => 1, - Ordering::Equal => { + Ordering::Less => -1, + Ordering::Greater => 1, + Ordering::Equal => { let aseq: u64 = aseq_str.parse().unwrap(); let bseq: u64 = bseq_str.parse().unwrap();; if aseq < bseq { @@ -290,10 +293,39 @@ impl KeyBuilder { } else if aseq > bseq { 1 } else { - match aarraypath_str.cmp(barraypath_str) { - Ordering::Less => -1, - Ordering::Greater => 1, - Ordering::Equal => 0, + if aarraypath_str.is_empty() || barraypath_str.is_empty() { + match 
aarraypath_str.len().cmp(&barraypath_str.len()) { + Ordering::Less => -1, + Ordering::Greater => 1, + Ordering::Equal => 0, + } + } else { + let mut a_nums = aarraypath_str.split(","); + let mut b_nums = barraypath_str.split(","); + loop { + if let Some(ref a_num_str) = a_nums.next() { + if let Some(ref b_num_str) = b_nums.next() { + let a_num: u64 = a_num_str.parse().unwrap(); + let b_num: u64 = b_num_str.parse().unwrap(); + match a_num.cmp(&b_num) { + Ordering::Less => return -1, + Ordering::Greater => return 1, + Ordering::Equal => (), + } + } else { + //b is shorter than a, so greater + return 1; + } + } else { + if b_nums.next().is_some() { + //a is shorter than b so less + return -1; + } else { + // same length and must have hit all equal before this, so equal + return 0; + } + } + } } } }, diff --git a/src/parser.rs b/src/parser.rs index e0ab8db..d718134 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -12,7 +12,8 @@ use json_value::JsonValue; use query::{Sort, Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore, AggregateFun, SortInfo, SortField}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, - StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter}; + StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter, + NotFilter}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs @@ -565,7 +566,15 @@ ws1 if !self.consume("find") { return Err(Error::Parse("Missing 'find' keyword".to_string())); } - self.object() + self.not_object() + } + + fn not_object<'b>(&'b mut self) -> Result, Error> { + if self.consume("!") { + Ok(Box::new(NotFilter::new(try!(self.object()), self.kb.arraypath_len()))) + } else { + self.object() + } } fn object<'b>(&'b mut self) -> Result, Error> { @@ -576,11 +585,11 @@ ws1 left = try!(self.consume_boost_and_wrap_filter(left)); if self.consume("&&") { - let right = try!(self.object()); + let right = try!(self.not_object()); Ok(Box::new(AndFilter::new(vec![left, right], self.kb.arraypath_len()))) } else if self.consume("||") { - let right = try!(self.object()); + let right = try!(self.not_object()); Ok(Box::new(OrFilter::new(left, right, self.kb.arraypath_len()))) } else { Ok(left) @@ -591,6 +600,9 @@ ws1 } fn parens<'b>(&'b mut self) -> Result, Error> { + if self.consume("!") { + return Ok(Box::new(NotFilter::new(try!(self.parens()), self.kb.arraypath_len()))); + } try!(self.must_consume("(")); let filter = try!(self.object()); try!(self.must_consume(")")); @@ -634,6 +646,15 @@ ws1 } fn oparens<'b>(&'b mut self) -> Result>, Error> { + let offset = self.offset; + if self.consume("!") { + if let Some(f) = try!(self.oparens()) { + return Ok(Some(Box::new(NotFilter::new(f, self.kb.arraypath_len())))); + } else { + self.offset = offset; + return Ok(None); + } + } let opt_filter = if self.consume("(") { let f = try!(self.obool()); try!(self.must_consume(")")); @@ -658,6 +679,9 @@ ws1 } fn compare<'b>(&'b mut self) -> Result, Error> { + if self.consume("!") { + return Ok(Box::new(NotFilter::new(try!(self.compare()), self.kb.arraypath_len()))); + } if self.consume("==") { let literal = try!(self.must_consume_string_literal()); let boost = try!(self.consume_boost()); @@ -754,6 +778,15 @@ ws1 } fn aparens<'b>(&'b mut self) -> Result>, Error> { + let offset = self.offset; + if self.consume("!") { + if let Some(f) = try!(self.aparens()) { + return Ok(Some(Box::new(NotFilter::new(f, 
self.kb.arraypath_len())))); + } else { + self.offset = offset; + return Ok(None); + } + } let opt_filter = if self.consume("(") { let f = try!(self.abool()); try!(self.must_consume(")")); diff --git a/src/query.rs b/src/query.rs index 3c5f3ff..5e989e3 100644 --- a/src/query.rs +++ b/src/query.rs @@ -67,11 +67,77 @@ impl DocResult { self.scores[term_ordinal].1 += 1; } + pub fn clone_only_seq_and_arraypath(&self) -> DocResult { + let mut dr = DocResult::new(); + dr.seq = self.seq; + dr.arraypath = self.arraypath.clone(); + dr + } + pub fn boost_scores(&mut self, boost: f32) { for &mut (ref mut score, ref mut _num_match) in self.scores.iter_mut() { *score *= boost; } } + + pub fn less(&self, other: &DocResult, mut array_depth: usize) -> bool { + if self.seq < other.seq { + return true; + } + let mut s = self.arraypath.iter(); + let mut o = other.arraypath.iter(); + loop { + if array_depth == 0 { + return false; + } + array_depth -= 1; + if let Some(i_s) = s.next() { + if let Some(i_o) = o.next() { + if i_s < i_o { + return true; + } + } else { + // self cannot be less than other + return false; + } + } else { + loop { + if array_depth == 0 { + return false; + } + array_depth -= 1; + if let Some(i_o) = o.next() { + if *i_o > 0 { + return true; + } + } else { + return true; + } + } + } + } + } + + // arraypaths must be the same length + pub fn cmp(&self, other: &DocResult) -> Ordering { + debug_assert_eq!(self.arraypath.len(), other.arraypath.len()); + match self.seq.cmp(&other.seq) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => self.arraypath.cmp(&other.arraypath), + } + } + + pub fn increment_last(&mut self, array_depth: usize) { + if array_depth == 0 { + self.seq += 1; + } else { + self.arraypath.resize(array_depth, 0); + if let Some(mut i) = self.arraypath.last_mut() { + *i += 1; + } + } + } } impl PartialEq for DocResult { @@ -86,22 +152,6 @@ impl PartialEq for DocResult { impl Eq for DocResult {} -impl PartialOrd for DocResult { - fn partial_cmp(&self, other: &DocResult) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for DocResult { - fn cmp(&self, other: &DocResult) -> Ordering { - match self.seq.cmp(&other.seq) { - Ordering::Less => Ordering::Less, - Ordering::Greater => Ordering::Greater, - Ordering::Equal => self.arraypath.cmp(&other.arraypath), - } - } -} - pub struct QueryScoringInfo { pub num_terms: usize, pub sum_of_idt_sqs: f32, @@ -122,6 +172,12 @@ impl Query { let mut returnable = try!(parser.return_clause()); let limit = try!(parser.limit_clause()); try!(parser.non_ws_left()); + try!(filter.check_double_not(false)); + + if filter.is_all_not() { + return Err(Error::Parse("query cannot be made up of only logical not. 
Must have at least \ one match clause that is not negated.".to_string())); + } let mut ags = Vec::new(); returnable.get_aggregate_funs(&mut ags); @@ -2027,6 +2083,137 @@ mod tests { } + #[test] + fn test_query_not() { + let dbname = "target/tests/querytestnot"; + + let _ = Index::delete(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + assert_eq!(Ok(()), index.add(r#"{"_id":"1", "bar": "fox"}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"2", "bar": "quick fox"}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"3", "bar": "quick brown fox"}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"4", "bar": ["fox"]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"5", "bar": ["quick fox"]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"6", "bar": ["quick brown fox"]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"7", "baz": ["fox"]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"8", "baz": ["quick","fox"]}"#)); + assert_eq!(Ok(()), index.add(r#"{"_id":"9", "baz": ["quick","brown","fox"]}"#)); + + index.flush().unwrap(); + + { + let mut query_results = Query::get_matches(r#"find {(bar: ~="fox" || bar: ~="brown") && (bar: !~="quick")} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {(bar: ~="fox" || bar: ~="brown") && !(bar: ~="quick")} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown"} && !{bar: ~="quick"} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: [(~="fox" || ~="brown") && !~="quick"]} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""4""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {bar: [(~="fox" || ~="brown") && !(~="quick")]} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""4""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {bar: [~="fox" || ~="brown"] && bar: ![~="quick"]} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""4""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {baz: [(~="fox" || ~="brown") && !~="quick"]} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""7""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""8""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""9""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let mut query_results = Query::get_matches(r#"find {baz: [(~="fox" || ~="brown") && !(~="quick")]} + return ._id "#.to_string(), &index).unwrap(); + 
assert_eq!(query_results.next_result().unwrap(),Some(r#""7""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""8""#.to_string())); + assert_eq!(query_results.next_result().unwrap(),Some(r#""9""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + + { + let mut query_results = Query::get_matches(r#"find {baz: [~="fox" || ~="brown"] && baz: ![~="quick"]} + return ._id "#.to_string(), &index).unwrap(); + + assert_eq!(query_results.next_result().unwrap(),Some(r#""7""#.to_string())); + assert_eq!(query_results.next_result().unwrap(), None); + } + + { + let result = Query::get_matches(r#"find !{baz: [~="fox"]} + return ._id "#.to_string(), &index); + match result { + Ok(_foo) => panic!("Didn't detect all logical nots."), + Err(_foo) => (), + } + } + + + { + let result = Query::get_matches(r#"find !{baz: ~="fox"} && !{baz: =="foo"} + return ._id "#.to_string(), &index); + match result { + Ok(_foo) => panic!("Didn't detect all logical nots."), + Err(_foo) => (), + } + } + + + { + let result = Query::get_matches(r#"find {foo: =="bar"} && !{baz: !~="fox"}} + return ._id "#.to_string(), &index); + match result { + Ok(_foo) => panic!("Didn't detect nested logical nots."), + Err(_foo) => (), + } + } + + } + #[test] fn test_query_more_docs() { let dbname = "target/tests/querytestdbmoredocs"; From c10c2f112d0b8bc5f406db443f0523b04903df50 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 12 Jan 2017 22:53:48 -0800 Subject: [PATCH 055/122] Removed obsolete grammar --- src/parser.rs | 62 --------------------------------------------------- 1 file changed, 62 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index d718134..e34f6b6 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -500,68 +500,6 @@ impl<'a> Parser<'a> { Ok(Some(lit)) } -/* - -find - = "find" ws object ws - -object - = "{" ws obool ws "}" ws (("&&" / "||") ws object)? - / parens - -parens - = "(" ws object ws ")" - -obool - = ws ocompare ws (('&&' / ',' / '||') ws obool)? - -ocompare - = oparens - / key ws ":" ws (oparens / compare) - -oparens - = '(' ws obool ws ')' ws - / array - / object - -compare - = ("==" / "~=" / "~" digits "=" ) ws string ws - -abool - = ws acompare ws (('&&'/ ',' / '||') ws abool)? 
- -acompare - = aparens - / compare - -aparens - = '(' ws abool ')' ws - / array - / object - -array - = '[' ws abool ']' ws - -key - = field / string - -field - = [a-z_$]i [a-z_$0-9]i* - -string - = '"' ('\\\\' / '\\' [\"tfvrnb] / [^\\\"])* '"' ws - -digits - = [0-9]+ - -ws - = [ \t\n\r]* - -ws1 - = [ \t\n\r]+ -*/ - - fn find<'b>(&'b mut self) -> Result, Error> { if !self.consume("find") { return Err(Error::Parse("Missing 'find' keyword".to_string())); } From 64be8233c307060898e5c788c8de29fcd5ebb149 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 12 Jan 2017 22:54:13 -0800 Subject: [PATCH 056/122] Added tests for boolean precedence --- src/query.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/query.rs b/src/query.rs index 5e989e3..3a7dd4a 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1438,6 +1438,14 @@ mod tests { assert_eq!(query_results.get_next_id().unwrap(), Some("9".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); + query_results = Query::get_matches(r#"find {A:[ == "A1" && == "A" || == "A1"]}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), Some("8".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), Some("9".to_string())); + assert_eq!(query_results.get_next_id().unwrap(), None); + + query_results = Query::get_matches(r#"find {A:[=="A" || == "A1" && == "A"]}"#.to_string(), &index).unwrap(); + assert_eq!(query_results.get_next_id().unwrap(), None); + query_results = Query::get_matches(r#"find {A: ~= "Multi"}"#.to_string(), &index).unwrap(); assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); assert_eq!(query_results.get_next_id().unwrap(), None); From c3810856ccb17eb2b1ca35ff1733b51312f9a6f3 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 13 Jan 2017 12:08:02 -0800 Subject: [PATCH 057/122] Switched ExactMatchFilter to use the already stored fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also using u32 varints for word positions, moved the field wordcount to a new key and stopped using capnproto. This will save a lot of space since capnproto needs a count for word positions, but since RocksDB knows the length of the values it's a waste. All this combined will reduce the size of the index and allow much more data to be indexed.
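A minimal sketch of the new value format (not part of the diff below; it uses the same varint crate calls as the patched code, with made-up word positions):

    extern crate varint;

    use std::io::Cursor;
    use varint::{VarintRead, VarintWrite};

    fn main() {
        // Word positions are written back to back with no leading count;
        // the value length RocksDB already stores bounds the read loop.
        let positions: Vec<u32> = vec![3, 17, 300, 70_000];
        let mut bytes = Cursor::new(Vec::new());
        for &pos in &positions {
            // small positions take 1 byte, larger ones up to 5
            bytes.write_unsigned_varint_32(pos).unwrap();
        }
        let mut read = Cursor::new(bytes.into_inner());
        let mut decoded = Vec::new();
        while let Ok(pos) = read.read_unsigned_varint_32() {
            decoded.push(pos);
        }
        assert_eq!(positions, decoded);
    }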
--- Cargo.toml | 5 +- build.rs | 3 +- capnp/main.rs | 11 -- capnp/records.capnp | 33 ------ src/error.rs | 11 -- src/filters.rs | 282 ++++++++++++++++++++++++-------------------- src/index.rs | 117 +++++++++--------- src/json_shred.rs | 71 ++++------- src/key_builder.rs | 30 +++++ src/lib.rs | 7 -- src/parser.rs | 47 ++++---- src/query.rs | 4 +- 12 files changed, 289 insertions(+), 332 deletions(-) delete mode 100644 capnp/main.rs delete mode 100644 capnp/records.capnp diff --git a/Cargo.toml b/Cargo.toml index fb39757..911ce70 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,13 +11,10 @@ build = "build.rs" [dependencies] -capnp = "0.7.4" rustc-serialize = "0.3.19" stemmer = "0.3.2" unicode-normalization = "0.1.2" unicode-segmentation = "0.1.2" rocksdb = "0.5.0" -backtrace = "0.2.0" +varint = "0.9.0" -[build-dependencies] -capnpc = "0.7.2" diff --git a/build.rs b/build.rs index fa6c048..3033f62 100644 --- a/build.rs +++ b/build.rs @@ -1,5 +1,4 @@ -extern crate capnpc; fn main() { - ::capnpc::compile("capnp", &["capnp/records.capnp"]).unwrap(); + } diff --git a/capnp/main.rs b/capnp/main.rs deleted file mode 100644 index e30c81f..0000000 --- a/capnp/main.rs +++ /dev/null @@ -1,11 +0,0 @@ -//extern crate capnp; -// -//pub mod records_capnp { -// include!(concat!(env!("OUT_DIR"), "/records_capnp.rs")); -//} -// -// -//pub mod records { -// use records_capnp::{header}; -// //use capnp::serialize_packed; -//} diff --git a/capnp/records.capnp b/capnp/records.capnp deleted file mode 100644 index c1dcd81..0000000 --- a/capnp/records.capnp +++ /dev/null @@ -1,33 +0,0 @@ -@0x89d4fcde0ae482cb; - -struct Header { - version @0 :UInt64; - highSeq @1 :UInt64; -} - -struct Payload { - - struct Wordinfo { - # Contains stemmed word and information about the orignal word before stemming - - # the position of the word in the text field - wordPos @0 :UInt32; - - # the offset of the suffix from the start of the stemmed word - # when combined with the stemmed word gets back the orignal - # text with case preserved - suffixOffset @1 :UInt32; - - # the actual suffix text, which can start at any point in the stemmed word - suffixText @2 :Text; - - # NOTE: at some point we should contain bit flags that indicate if the original string - # was propercase, all uppercase, contains a trailing space, a trailing period, - # a trailing period and space, etc up to 8 flags. This would mean less information would - # need to be stored in the suffix text for most words at the cost of 1 byte per word - # info. 
- } - totalWords @0: UInt32; - wordinfos @1 :List(Wordinfo); -} - diff --git a/src/error.rs b/src/error.rs index 59698e6..cb59532 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,4 +1,3 @@ -extern crate capnp; extern crate rocksdb; use std::{error, fmt}; @@ -11,7 +10,6 @@ use std::io; pub enum Error { Parse(String), Shred(String), - Capnp(capnp::Error), Rocks(rocksdb::Error), Write(String), Io(io::Error), @@ -28,7 +26,6 @@ impl error::Error for Error { match *self { Error::Parse(ref description) => description, Error::Shred(ref description) => description, - Error::Capnp(ref err) => err.description(), // XXX vmx 2016-11-07: It should be fixed on the RocksDB wrapper // that it has the std::error:Error implemented and hence // and err.description() @@ -42,7 +39,6 @@ impl error::Error for Error { match *self { Error::Parse(_) => None, Error::Shred(_) => None, - Error::Capnp(ref err) => Some(err as &error::Error), // NOTE vmx 2016-11-07: Looks like the RocksDB Wrapper needs to be // patched to be based on the std::error::Error trait Error::Rocks(_) => None, @@ -52,12 +48,6 @@ impl error::Error for Error { } } -impl From for Error { - fn from(err: capnp::Error) -> Error { - Error::Capnp(err) - } -} - impl From for Error { fn from(err: rocksdb::Error) -> Error { Error::Rocks(err) @@ -87,7 +77,6 @@ impl fmt::Display for Error { match *self { Error::Parse(ref err) => write!(f, "Parse error: {}", err), Error::Shred(ref err) => write!(f, "Shred error: {}", err), - Error::Capnp(ref err) => write!(f, "Capnproto error: {}", err), Error::Rocks(ref err) => write!(f, "RocksDB error: {}", err), Error::Write(ref err) => write!(f, "Write error: {}", err), Error::Io(ref err) => write!(f, "Io error: {}", err), diff --git a/src/filters.rs b/src/filters.rs index db38c24..728fadf 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,4 +1,4 @@ -extern crate capnp; +extern crate varint; use std::str; use std::cmp::Ordering; @@ -6,45 +6,49 @@ use std::collections::BTreeMap; use std::collections::HashSet; use index::Index; use std::f32; +use std::io::Cursor; use error::Error; use key_builder::KeyBuilder; -use stems::StemmedWord; -use query::{DocResult, QueryScoringInfo}; +use query::{DocResult, QueryScoringInfo, RetValue}; +use json_value::JsonValue; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{self, DBIterator, IteratorMode}; -use records_capnp::payload; +use rocksdb::{self, DBIterator, Snapshot, IteratorMode}; +use self::varint::VarintRead; struct Scorer { + iter: DBIterator, idf: f32, boost: f32, - keypathword_count_key: String, - keypath_count_key: String, + kb: KeyBuilder, + word: String, term_ordinal: usize, } impl Scorer { - fn new(word: &str, kb: &KeyBuilder, boost: f32) -> Scorer { + fn new(iter: DBIterator, word: &str, kb: &KeyBuilder, boost: f32) -> Scorer { Scorer { + iter: iter, idf: f32::NAN, boost: boost, - keypathword_count_key: kb.keypathword_count_key(&word), - keypath_count_key: kb.keypath_count_key(), + kb: kb.clone(), + word: word.to_string(), term_ordinal: 0, } } - fn init(&mut self, mut iter: &mut DBIterator, qsi: &mut QueryScoringInfo) { - let doc_freq = if let Some(bytes) = self.get_value(&mut iter, - &self.keypathword_count_key) { - Index::convert_bytes_to_u64(bytes.as_ref()) as f32 + fn init(&mut self, qsi: &mut QueryScoringInfo) { + let key = self.kb.keypathword_count_key(&self.word); + let doc_freq = if let Some(bytes) = self.get_value(&key) { + Index::convert_bytes_to_u32(bytes.as_ref()) as f32 } else { 0.0 
}; - - let num_docs = if let Some(bytes) = self.get_value(&mut iter, &self.keypath_count_key) { - Index::convert_bytes_to_u64(bytes.as_ref()) as f32 + + let key = self.kb.keypath_count_key(); + let num_docs = if let Some(bytes) = self.get_value(&key) { + Index::convert_bytes_to_u32(bytes.as_ref()) as f32 } else { 0.0 }; @@ -55,9 +59,9 @@ impl Scorer { qsi.sum_of_idt_sqs += self.idf * self.idf; } - fn get_value(&self, iter: &mut DBIterator, key: &String) -> Option> { - iter.set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); - if let Some((ret_key, ret_value)) = iter.next() { + fn get_value(&mut self, key: &str) -> Option> { + self.iter.set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); + if let Some((ret_key, ret_value)) = self.iter.next() { if ret_key.len() == key.len() && ret_key.starts_with(key.as_bytes()) { Some(ret_value) } else { @@ -68,9 +72,15 @@ impl Scorer { } } - fn add_match_score(&self, num_matches: u32, - total_field_words: u32, dr: &mut DocResult) { + fn add_match_score(&mut self, num_matches: u32, dr: &mut DocResult) { if self.should_score() { + let key = self.kb.field_length_key_from_doc_result(dr); + let total_field_words = if let Some(bytes) = self.get_value(&key) { + Index::convert_bytes_to_u32(bytes.as_ref()) as f32 + } else { + panic!("Couldn't find field length for a match!! WHAT!"); + }; + let tf: f32 = (num_matches as f32).sqrt(); let norm = 1.0/(total_field_words as f32).sqrt(); let score = self.idf * self.idf * tf * norm * self.boost; @@ -95,89 +105,6 @@ pub trait QueryRuntimeFilter { fn is_all_not(&self) -> bool; } -pub struct ExactMatchFilter { - iter: DBIterator, - keypathword: String, - word_pos: u32, - suffix_offset: u32, - suffix: String, - scorer: Scorer, -} - - - -impl ExactMatchFilter { - pub fn new(iter: DBIterator, stemmed_word: &StemmedWord, - kb: &KeyBuilder, boost: f32) -> ExactMatchFilter { - ExactMatchFilter{ - iter: iter, - keypathword: kb.get_keypathword_only(&stemmed_word.stemmed), - word_pos: stemmed_word.word_pos, - suffix: stemmed_word.suffix.clone(), - suffix_offset: stemmed_word.suffix_offset, - scorer: Scorer::new(&stemmed_word.stemmed, &kb, boost), - } - } -} - -impl QueryRuntimeFilter for ExactMatchFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - - KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); - // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), - rocksdb::Direction::Forward)); - - KeyBuilder::truncate_to_keypathword(&mut self.keypathword); - - self.next_result() - } - - fn next_result(&mut self) -> Result, Error> { - loop { - let (key, value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return Ok(None), - }; - if !key.starts_with(self.keypathword.as_bytes()) { - // we passed the key path we are interested in. 
nothing left to do */ - return Ok(None) - } - - // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed - // and why we pass on mutable reference of it to `read_message()` - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - let wordinfos = try!(payload.get_wordinfos()); - for wi in wordinfos.iter() { - if self.word_pos == wi.get_word_pos() && - self.suffix_offset == wi.get_suffix_offset() && - self.suffix == try!(wi.get_suffix_text()) { - // We have a candidate document to return - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); - self.scorer.add_match_score(wordinfos.len(), - payload.get_total_words(), &mut dr); - return Ok(Some(dr)); - } - } - } - } - - fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { - self.scorer.init(&mut self.iter, &mut qsi); - } - - fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { - Ok(()) - } - - fn is_all_not(&self) -> bool { - false - } -} pub struct StemmedWordFilter { iter: DBIterator, @@ -186,19 +113,18 @@ pub struct StemmedWordFilter { } impl StemmedWordFilter { - pub fn new(iter: DBIterator, stemmed_word: &str, + pub fn new(snapshot: &Snapshot, stemmed_word: &str, kb: &KeyBuilder, boost: f32) -> StemmedWordFilter { StemmedWordFilter { - iter: iter, + iter: snapshot.iterator(IteratorMode::Start), keypathword: kb.get_keypathword_only(&stemmed_word), - scorer: Scorer::new(stemmed_word, kb, boost), + scorer: Scorer::new(snapshot.iterator(IteratorMode::Start), stemmed_word, kb, boost), } } } impl QueryRuntimeFilter for StemmedWordFilter { fn first_result(&mut self, start: &DocResult) -> Result, Error> { - KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); // Seek in index to >= entry self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), @@ -224,20 +150,21 @@ impl QueryRuntimeFilter for StemmedWordFilter { let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); if self.scorer.should_score() { - let message_reader = ::capnp::serialize_packed::read_message( - &mut &*value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - - self.scorer.add_match_score(try!(payload.get_wordinfos()).len(), - payload.get_total_words(), &mut dr); + let mut vec = Vec::with_capacity(value.len()); + vec.extend(value.into_iter()); + let mut bytes = Cursor::new(vec); + let mut count = 0; + while let Ok(_pos) = bytes.read_unsigned_varint_32() { + count += 1; + } + self.scorer.add_match_score(count, &mut dr); } Ok(Some(dr)) } fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { - self.scorer.init(&mut self.iter, &mut qsi); + self.scorer.init(&mut qsi); } fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { @@ -258,11 +185,13 @@ pub struct StemmedWordPosFilter { } impl StemmedWordPosFilter { - pub fn new(iter: DBIterator, stemmed_word: &str, kb: &KeyBuilder, boost: f32) -> StemmedWordPosFilter { + pub fn new(snapshot: &Snapshot, stemmed_word: &str, + kb: &KeyBuilder, boost: f32) -> StemmedWordPosFilter { StemmedWordPosFilter{ - iter: iter, + iter: snapshot.iterator(IteratorMode::Start), keypathword: kb.get_keypathword_only(&stemmed_word), - scorer: Scorer::new(&stemmed_word, &kb, boost), + scorer: Scorer::new(snapshot.iterator(IteratorMode::Start), + 
&stemmed_word, &kb, boost), } } @@ -289,24 +218,24 @@ impl StemmedWordPosFilter { return Ok(None) } - let message_reader = ::capnp::serialize_packed::read_message( - &mut &*value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - let positions: Vec = try!(payload.get_wordinfos()).iter() - .map(|wi| wi.get_word_pos()) - .collect(); - let key_str = unsafe{str::from_utf8_unchecked(&key)}; let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); - self.scorer.add_match_score(positions.len() as u32, payload.get_total_words(), &mut dr); + let mut vec = Vec::with_capacity(value.len()); + vec.extend(value.into_iter()); + let mut bytes = Cursor::new(vec); + let mut positions = Vec::new(); + while let Ok(pos) = bytes.read_unsigned_varint_32() { + positions.push(pos); + } + + self.scorer.add_match_score(positions.len() as u32, &mut dr); Ok(Some((dr, positions))) } fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { - self.scorer.init(&mut self.iter, &mut qsi); + self.scorer.init(&mut qsi); } } @@ -316,6 +245,7 @@ pub struct StemmedPhraseFilter { impl StemmedPhraseFilter { pub fn new(filters: Vec) -> StemmedPhraseFilter { + assert!(filters.len() > 0); StemmedPhraseFilter { filters: filters, } @@ -329,6 +259,10 @@ impl StemmedPhraseFilter { if base.is_none() { return Ok(None); } let (mut base_result, mut base_positions) = base.unwrap(); + if matches_left == 0 { + return Ok(Some(base_result)); + } + let mut current_filter = 0; loop { current_filter += 1; @@ -344,7 +278,7 @@ impl StemmedPhraseFilter { if base_result == next_result { let mut new_positions = Vec::new(); for &pos in next_positions.iter() { - if let Ok(_) = base_positions.binary_search(&(pos-1)) { + if let Ok(_) = base_positions.binary_search(&(pos.saturating_sub(1))) { new_positions.push(pos); } } @@ -410,6 +344,92 @@ impl QueryRuntimeFilter for StemmedPhraseFilter { } } + +pub struct ExactMatchFilter { + iter: DBIterator, + filter: StemmedPhraseFilter, + kb: KeyBuilder, + phrase: String, + case_sensitive: bool, +} + +impl ExactMatchFilter { + pub fn new(snapshot: &Snapshot, filter: StemmedPhraseFilter, + kb: KeyBuilder, phrase: String, case_sensitive: bool) -> ExactMatchFilter { + ExactMatchFilter { + iter: snapshot.iterator(IteratorMode::Start), + filter: filter, + kb: kb, + phrase: if case_sensitive {phrase} else {phrase.to_lowercase()}, + case_sensitive: case_sensitive, + } + } + + fn check_exact(&mut self, mut dr: DocResult) -> Result, Error> { + loop { + let value_key = self.kb.value_key_from_doc_result(&dr); + + self.iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + if let Some((key, value)) = self.iter.next() { + debug_assert!(key.starts_with(value_key.as_bytes())); // must always be true! 
+ if let JsonValue::String(string) = RetValue::bytes_to_json_value(&*value) { + let matches = if self.case_sensitive { + self.phrase == string + } else { + self.phrase == string.to_lowercase() + }; + if matches { + return Ok(Some(dr)); + } else { + if let Some(next) = try!(self.filter.next_result()) { + dr = next; + // continue looping + } else { + return Ok(None); + } + } + } else { + panic!("Not a string, wtf!"); + } + } else { + panic!("Couldn't find value, hulk smash!"); + } + } + } +} + +impl QueryRuntimeFilter for ExactMatchFilter { + fn first_result(&mut self, start: &DocResult) -> Result, Error> { + if let Some(dr) = try!(self.filter.first_result(start)) { + self.check_exact(dr) + } else { + Ok(None) + } + } + + fn next_result(&mut self) -> Result, Error> { + if let Some(dr) = try!(self.filter.next_result()) { + self.check_exact(dr) + } else { + Ok(None) + } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + self.filter.prepare_relevancy_scoring(&mut qsi); + } + + fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { + self.filter.check_double_not(parent_is_neg) + } + + fn is_all_not(&self) -> bool { + self.filter.is_all_not() + } +} + pub struct DistanceFilter { filters: Vec, current_filter: usize, diff --git a/src/index.rs b/src/index.rs index 4f6cd93..eb52fea 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,10 +1,13 @@ extern crate rocksdb; +extern crate varint; use std::collections::HashMap; use std::str; +use std::io::Cursor; use std::mem; +use std::io::Write; -use records_capnp::header; +use self::varint::{VarintRead, VarintWrite}; use rocksdb::MergeOperands; @@ -13,32 +16,6 @@ use json_shred::{Shredder}; const NOISE_HEADER_VERSION: u64 = 1; -struct Header { - version: u64, - high_seq: u64, -} - -impl Header { - fn new() -> Header { - Header{ - version: NOISE_HEADER_VERSION, - high_seq: 0, - } - } - fn serialize(&self) -> Vec { - let mut message = ::capnp::message::Builder::new_default(); - { - let mut header = message.init_root::(); - header.set_version(self.version); - header.set_high_seq(self.high_seq); - } - let mut bytes = Vec::new(); - ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap(); - bytes - } -} - - pub struct Index { write_options: rocksdb::WriteOptions, high_doc_seq: u64, @@ -82,24 +59,25 @@ impl Index { let rocks = try!(rocksdb::DB::open(&rocks_options, name)); - let header = Header::new(); - let status = rocks.put_opt(b"HDB", &*header.serialize(), &self.write_options); - println!("put was ok? 
{}", status.is_ok()); + + let mut bytes = Vec::with_capacity(8*2); + bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap(); + bytes.write(&Index::convert_u64_to_bytes(0)).unwrap(); + try!(rocks.put_opt(b"HDB", &bytes, &self.write_options)); + rocks } }; // validate header is there let value = try!(rocks.get(b"HDB")).unwrap(); - // NOTE vmx 2016-10-13: I'm not really sure why the dereferencing is needed - // and why we pass on mutable reference of it to `read_message()` - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let header = message_reader.get_root::().unwrap(); - assert_eq!(header.get_version(), NOISE_HEADER_VERSION); - self.high_doc_seq = header.get_high_seq(); self.rocks = Some(rocks); + assert_eq!(value.len(), 8*2); + // first 8 is version + assert_eq!(Index::convert_bytes_to_u64(&value[..8]), NOISE_HEADER_VERSION); + // next 8 is high seq + self.high_doc_seq = Index::convert_bytes_to_u64(&value[8..]); + Ok(()) } @@ -150,9 +128,10 @@ impl Index { try!(self.batch.as_mut().unwrap().put(id.as_bytes(), seq.as_bytes())); } - let mut header = Header::new(); - header.high_seq = self.high_doc_seq; - try!(self.batch.as_mut().unwrap().put(b"HDB", &*header.serialize())); + let mut bytes = Vec::with_capacity(8*2); + bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap(); + bytes.write(&Index::convert_u64_to_bytes(self.high_doc_seq)).unwrap(); + try!(self.batch.as_mut().unwrap().put(b"HDB", &bytes)); let status = try!(rocks.write(self.batch.take().unwrap())); // Make sure there's a always a valid WriteBarch after writing it into RocksDB, @@ -162,6 +141,36 @@ impl Index { Ok(status) } + /// Should not be used generally since it not varint. Used for header fields + /// since only one header is in the database it's not a problem with excess size. + fn convert_bytes_to_u64(bytes: &[u8]) -> u64 { + debug_assert!(bytes.len() == 8); + let mut buffer = [0; 8]; + for (n, b) in bytes.iter().enumerate() { + buffer[n] = *b; + } + unsafe{ mem::transmute(buffer) } + } + + /// Should not be used generally since it not varint. Used for header fields + /// since only one header is in the database it's not a problem with excess size. + fn convert_u64_to_bytes(val: u64) -> [u8; 8] { + unsafe{ mem::transmute(val) } + } + + pub fn convert_bytes_to_u32(bytes: &[u8]) -> u32 { + let mut vec = Vec::with_capacity(bytes.len()); + vec.extend(bytes.into_iter()); + let mut read = Cursor::new(vec); + read.read_unsigned_varint_32().unwrap() + } + + pub fn convert_u32_to_bytes(val: u32) -> Vec { + let mut bytes = Cursor::new(Vec::new()); + assert!(bytes.write_unsigned_varint_32(val).is_ok()); + bytes.into_inner() + } + pub fn fetch_id(&self, seq: u64) -> Result, String> { // Fetching an ID is only possible if the index is open // NOTE vmx 2016-10-17: Perhaps that shouldn't panic? 
@@ -192,19 +201,6 @@ impl Index {
         }
     }

-    pub fn convert_bytes_to_u64(bytes: &[u8]) -> u64 {
-        debug_assert!(bytes.len() == 8);
-        let mut buffer = [0; 8];
-        for (n, b) in bytes.iter().enumerate() {
-            buffer[n] = *b;
-        }
-        unsafe{ mem::transmute(buffer) }
-    }
-
-    pub fn convert_u64_to_bytes(val: u64) -> [u8; 8] {
-        unsafe{ mem::transmute(val) }
-    }
-
     fn sum_merge(new_key: &[u8],
                  existing_val: Option<&[u8]>,
                  operands: &mut MergeOperands)
@@ -213,17 +209,17 @@ impl Index {
             panic!("unknown key type to merge!");
         }

-        let mut count:u64 = if let Some(bytes) = existing_val {
-            Index::convert_bytes_to_u64(&bytes)
+        let mut count = if let Some(bytes) = existing_val {
+            Index::convert_bytes_to_u32(&bytes)
         } else {
             0
         };

         for bytes in operands {
-            count += Index::convert_bytes_to_u64(&bytes);
+            count += Index::convert_bytes_to_u32(&bytes);
         }

-        Index::convert_u64_to_bytes(count).into_iter().map(|b| *b).collect()
+        Index::convert_u32_to_bytes(count)
     }
 }

@@ -235,9 +231,12 @@ mod tests {

     #[test]
     fn test_open() {
+        let dbname = "target/tests/firstnoisedb";
+        let _ = Index::delete(dbname);
+
         let mut index = Index::new();
         //let db = super::Index::open("firstnoisedb", Option::None).unwrap();
-        index.open("target/tests/firstnoisedb", Some(OpenOptions::Create)).unwrap();
+        index.open(dbname, Some(OpenOptions::Create)).unwrap();
         index.flush().unwrap();
     }
 }
diff --git a/src/json_shred.rs b/src/json_shred.rs
index 402ab92..b567052 100644
--- a/src/json_shred.rs
+++ b/src/json_shred.rs
@@ -1,36 +1,25 @@
 extern crate rocksdb;
 extern crate rustc_serialize;
+extern crate varint;

 use std::collections::HashMap;
 use std::mem::transmute;
 use std::io::Write;
 use std::str::Chars;
+use std::io::Cursor;

+use self::varint::VarintWrite;
 use self::rustc_serialize::json::{JsonEvent, Parser, StackElement};

 use error::Error;
 use key_builder::KeyBuilder;
-use records_capnp::payload;
 use stems::Stems;
+use index::Index;

 // Good example of using rustc_serialize: https://github.com/ajroetker/beautician/blob/master/src/lib.rs
 // Callback based JSON streaming parser: https://github.com/gyscos/json-streamer.rs
 // Another parser based on rustc_serialize: https://github.com/isagalaev/ijson-rust/blob/master/src/test.rs#L11
-
-#[derive(Debug, PartialEq)]
-struct WordInfo {
-    //offset in the text field where the stemmed text starts
-    word_pos: u32,
-
-    // the start of the suffixText
-    suffix_offset: u32,
-
-    // the suffix of the stemmed text. When applied over stemmed, the original
-    // text is returned.
- suffix_text: String, -} - type ArrayOffsets = Vec; enum ObjectKeyTypes { @@ -70,49 +59,33 @@ impl Shredder { fn add_entries(&mut self, text: &String, docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { let stems = Stems::new(text.as_str()); - let mut word_to_word_infos = HashMap::new(); - let mut total_words = 0; + let mut word_to_word_positions = HashMap::new(); + let mut total_words: u32 = 0; + let mut one_enc_bytes = Cursor::new(Vec::new()); + assert!(one_enc_bytes.write_unsigned_varint_32(1).is_ok()); for stem in stems { - let word_infos = word_to_word_infos.entry(stem.stemmed).or_insert(Vec::new()); total_words += 1; - word_infos.push(WordInfo{ - word_pos: stem.word_pos, - suffix_text: stem.suffix.to_string(), - suffix_offset: stem.suffix_offset, - }); + let &mut (ref mut word_positions, ref mut count) = word_to_word_positions.entry(stem.stemmed) + .or_insert((Cursor::new(Vec::new()), 0)); + assert!(word_positions.write_unsigned_varint_32(stem.word_pos).is_ok()); + *count += 1; } - for (stemmed, word_infos) in word_to_word_infos { - let mut message = ::capnp::message::Builder::new_default(); - let count: u32; - { - let mut capn_payload = message.init_root::(); - count = word_infos.len() as u32; - capn_payload.set_total_words(total_words); - let mut capn_wordinfos = capn_payload.init_wordinfos(count); - for (pos, word_info) in word_infos.iter().enumerate() { - let mut capn_wordinfo = capn_wordinfos.borrow().get(pos as u32); - capn_wordinfo.set_word_pos(word_info.word_pos); - capn_wordinfo.set_suffix_text(&word_info.suffix_text); - capn_wordinfo.set_suffix_offset(word_info.suffix_offset); - } - } - - let mut bytes = Vec::new(); - ::capnp::serialize_packed::write_message(&mut bytes, &message).unwrap(); + for (stemmed, (word_positions, count)) in word_to_word_positions { let key = self.kb.stemmed_word_key(&stemmed, docseq); - try!(batch.put(&key.into_bytes(), &bytes)); + try!(batch.put(&key.into_bytes(), &word_positions.into_inner())); - let bytes = unsafe{ transmute::(count as u64) }; + let key = self.kb.field_length_key(docseq); + try!(batch.put(&key.into_bytes(), &Index::convert_u32_to_bytes(total_words))); + let key = self.kb.keypathword_count_key(&stemmed); - try!(batch.merge(&key.into_bytes(), &bytes)); + try!(batch.merge(&key.into_bytes(), &Index::convert_u32_to_bytes(count))); - let bytes = unsafe{ transmute::(1) }; let key = self.kb.keypath_count_key(); - try!(batch.merge(&key.into_bytes(), &bytes)); - + try!(batch.merge(&key.into_bytes(), one_enc_bytes.get_ref())); } + let key = self.kb.value_key(docseq); let mut buffer = String::with_capacity(text.len() + 1); buffer.push('s'); @@ -326,7 +299,7 @@ impl Shredder { } } - +/* #[cfg(test)] mod tests { extern crate rocksdb; @@ -455,4 +428,4 @@ mod tests { let result = wordinfos_from_rocks(&rocks); assert!(result.is_empty()); } -} +} */ diff --git a/src/key_builder.rs b/src/key_builder.rs index 09a4081..9a9014c 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -63,6 +63,36 @@ impl KeyBuilder { string } + /// Builds a field length key for the seq, using the key_path and arraypath + /// built up internally. 
+    pub fn field_length_key(&self, seq: u64) -> String {
+        let mut string = String::with_capacity(100);
+        string.push('L');
+        for segment in &self.keypath {
+            string.push_str(&segment);
+        }
+        string.push('#');
+        string.push_str(seq.to_string().as_str());
+
+        KeyBuilder::add_arraypath(&mut string, &self.arraypath);
+        string
+    }
+
+    /// Builds a field length key for the DocResult, using the key_path
+    /// built up internally and the arraypath from the DocResult.
+    pub fn field_length_key_from_doc_result(&self, dr: &DocResult) -> String {
+        let mut string = String::with_capacity(100);
+        string.push('L');
+        for segment in &self.keypath {
+            string.push_str(&segment);
+        }
+        string.push('#');
+        string.push_str(dr.seq.to_string().as_str());
+
+        KeyBuilder::add_arraypath(&mut string, &dr.arraypath);
+        string
+    }
+
     /// Adds DocResult seq and array path to an already created keypathword.
     pub fn add_doc_result_to_keypathword(keypathword: &mut String, dr: &DocResult) {
         keypathword.push_str(dr.seq.to_string().as_str());
diff --git a/src/lib.rs b/src/lib.rs
index 85830c4..04e1dc4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,3 @@
-extern crate capnp;
 extern crate rocksdb;

 mod error;
@@ -10,9 +9,3 @@ mod parser;
 mod stems;
 pub mod index;
 pub mod query;
-
-// include capnp code generated by `build.rs`
-mod records_capnp {
-    #![allow(dead_code)]
-    include!(concat!(env!("OUT_DIR"), "/records_capnp.rs"));
-}
diff --git a/src/parser.rs b/src/parser.rs
index e34f6b6..e8465a1 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -17,7 +17,7 @@ use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWo

 // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual things
-use rocksdb::{IteratorMode, Snapshot};
+use rocksdb::Snapshot;


 pub struct Parser<'a> {
@@ -623,19 +623,16 @@ impl<'a> Parser<'a> {
         if self.consume("==") {
             let literal = try!(self.must_consume_string_literal());
             let boost = try!(self.consume_boost());
-            let stems = Stems::new(&literal);
-            let mut filters: Vec> = Vec::new();
-            for stem in stems {
-                let iter = self.snapshot.iterator(IteratorMode::Start);
-                let filter = Box::new(ExactMatchFilter::new(
-                    iter, &stem, &self.kb, boost));
-                filters.push(filter);
-            }
-            match filters.len() {
-                0 => panic!("Cannot create a ExactMatchFilter"),
-                1 => Ok(filters.pop().unwrap()),
-                _ => Ok(Box::new(AndFilter::new(filters, self.kb.arraypath_len()))),
+            let mut filters: Vec<StemmedWordPosFilter> = Vec::new();
+            {
+                let stems = Stems::new(&literal);
+                for stem in stems {
+                    filters.push(StemmedWordPosFilter::new(&self.snapshot,
+                        &stem.stemmed, &self.kb, boost));
+                }
             }
+            let filter = StemmedPhraseFilter::new(filters);
+            Ok(Box::new(ExactMatchFilter::new(&self.snapshot, filter, self.kb.clone(), literal, true)))
         } else if self.consume("~=") {
             // regular search
             let literal = try!(self.must_consume_string_literal());
@@ -646,15 +643,14 @@ impl<'a> Parser<'a> {
             match stemmed_words.len() {
                 0 => panic!("Cannot create a StemmedWordFilter"),
                 1 => {
-                    let iter = self.snapshot.iterator(IteratorMode::Start);
-                    Ok(Box::new(StemmedWordFilter::new(iter, &stemmed_words[0], &self.kb, boost)))
+                    Ok(Box::new(StemmedWordFilter::new(&self.snapshot,
+                        &stemmed_words[0], &self.kb, boost)))
                 },
                 _ => {
                     let mut filters: Vec<StemmedWordPosFilter> = Vec::new();
                     for stemmed_word in stemmed_words {
-                        let iter = self.snapshot.iterator(IteratorMode::Start);
-                        let filter = StemmedWordPosFilter::new(iter, &stemmed_word,
-                            &self.kb, boost);
+                        let filter = StemmedWordPosFilter::new(&self.snapshot,
+                            &stemmed_word, &self.kb, boost);
filters.push(filter); } Ok(Box::new(StemmedPhraseFilter::new(filters))) @@ -674,9 +670,8 @@ impl<'a> Parser<'a> { let stems = Stems::new(&literal); let mut filters: Vec = Vec::new(); for stem in stems { - let iter = self.snapshot.iterator(IteratorMode::Start); - let filter = StemmedWordPosFilter::new( - iter, &stem.stemmed, &self.kb, boost); + let filter = StemmedWordPosFilter::new(&self.snapshot, + &stem.stemmed, &self.kb, boost); filters.push(filter); } if word_distance > std::u32::MAX as i64 { @@ -1081,8 +1076,11 @@ mod tests { #[test] fn test_whitespace() { + let dbname = "target/tests/test_whitespace"; + let _ = Index::delete(dbname); + let mut index = Index::new(); - index.open("target/tests/test_whitespace", Some(OpenOptions::Create)).unwrap(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); let rocks = &index.rocks.unwrap(); let mut snapshot = Snapshot::new(rocks); @@ -1100,8 +1098,11 @@ mod tests { #[test] fn test_must_consume_string_literal() { + let dbname = "target/tests/test_must_consume_string_literal"; + let _ = Index::delete(dbname); + let mut index = Index::new(); - index.open("target/tests/test_must_consume_string_literal", Some(OpenOptions::Create)).unwrap(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); let rocks = &index.rocks.unwrap(); let snapshot = Snapshot::new(rocks); diff --git a/src/query.rs b/src/query.rs index 3a7dd4a..c6ebf44 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1044,7 +1044,7 @@ pub struct RetValue { } impl RetValue { - fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { + pub fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { match bytes[0] as char { 's' => { let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); @@ -1166,7 +1166,7 @@ impl RetValue { value_key_next.truncate(value_key.len()); value_key_next.push_str(&escaped2); } else { - return RetValue::return_array(array);; + return RetValue::return_array(array); } } }, From 62cd2c4383d0d6589e44ba70b0c77c2b5e48232d Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 13 Jan 2017 13:27:29 -0800 Subject: [PATCH 058/122] Reenable tests in json_shred and remove dead code in stemmer Changed the tests to new positions only payload info --- src/json_shred.rs | 77 +++++++++++++++---------------------- src/stems.rs | 97 ++++++++--------------------------------------- 2 files changed, 46 insertions(+), 128 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index b567052..ded1c0a 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -299,34 +299,32 @@ impl Shredder { } } -/* + #[cfg(test)] mod tests { extern crate rocksdb; + extern crate varint; + + use self::varint::VarintRead; + + use std::io::Cursor; use std::str; - use records_capnp; - use super::{WordInfo}; + use index::{Index, OpenOptions}; - fn wordinfos_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, Vec)> { + fn positions_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, Vec)> { let mut result = Vec::new(); for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { if key[0] as char == 'W' { - let mut ref_value = &*value; - let message_reader = ::capnp::serialize_packed::read_message( - &mut ref_value, ::capnp::message::ReaderOptions::new()).unwrap(); - let payload = message_reader.get_root::().unwrap(); - - let mut wordinfos = Vec::new(); - for wi in payload.get_wordinfos().unwrap().iter() { - wordinfos.push(WordInfo{ - word_pos: wi.get_word_pos(), - suffix_text: wi.get_suffix_text().unwrap().to_string(), - suffix_offset: wi.get_suffix_offset(), - }); + let mut vec = 
Vec::with_capacity(value.len()); + vec.extend(value.into_iter()); + let mut bytes = Cursor::new(vec); + let mut positions = Vec::new(); + while let Ok(pos) = bytes.read_unsigned_varint_32() { + positions.push(pos); } let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); - result.push((key_string, wordinfos)); + result.push((key_string, positions)); } } result @@ -349,17 +347,13 @@ mod tests { let rocks = &index.rocks.unwrap(); rocks.write(batch).unwrap(); - let result = wordinfos_from_rocks(&rocks); + let result = positions_from_rocks(&rocks); let expected = vec![ - ("W.some$!array#123,0".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 5 }]), - ("W.some$!data#123,1".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), - ("W.some$$!also#123,2,0".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 4 }]), - ("W.some$$!nest#123,2,1".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "ed".to_string(), suffix_offset: 4 }]), + ("W.some$!array#123,0".to_string(), vec![0]), + ("W.some$!data#123,1".to_string(), vec![0]), + ("W.some$$!also#123,2,0".to_string(), vec![0]), + ("W.some$$!nest#123,2,1".to_string(), vec![0]), ]; assert_eq!(result, expected); } @@ -384,26 +378,17 @@ mod tests { let rocks = &index.rocks.unwrap(); rocks.write(batch).unwrap(); - let result = wordinfos_from_rocks(&rocks); + let result = positions_from_rocks(&rocks); println!("result: {:?}", result); let expected = vec![ - ("W.A$.B!b1#1234,1".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), - ("W.A$.B!b2vmx#1234,0".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "B2 VMX ".to_string(), - suffix_offset: 0 }]), - ("W.A$.B!three#1234,0".to_string(), vec![ - WordInfo { word_pos: 10, suffix_text: "".to_string(), suffix_offset: 15 }]), - ("W.A$.B!two#1234,0".to_string(), vec![ - WordInfo { word_pos: 6, suffix_text: " ".to_string(), suffix_offset: 9 }]), - ("W.A$.C!..#1234,0".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), - ("W.A$.C!..#1234,1".to_string(), vec![ - WordInfo { word_pos: 0, suffix_text: "".to_string(), suffix_offset: 2 }]), - ("W.A$.C!c2#1234,0".to_string(), vec![ - WordInfo { word_pos: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), - ("W.A$.C!c2#1234,1".to_string(), vec![ - WordInfo { word_pos: 2, suffix_text: "C2".to_string(), suffix_offset: 2 }]), + ("W.A$.B!b1#1234,1".to_string(), vec![0]), + ("W.A$.B!b2vmx#1234,0".to_string(), vec![0]), + ("W.A$.B!three#1234,0".to_string(), vec![10]), + ("W.A$.B!two#1234,0".to_string(), vec![6]), + ("W.A$.C!..#1234,0".to_string(), vec![0]), + ("W.A$.C!..#1234,1".to_string(), vec![0]), + ("W.A$.C!c2#1234,0".to_string(), vec![2]), + ("W.A$.C!c2#1234,1".to_string(), vec![2]), ]; assert_eq!(result, expected); } @@ -425,7 +410,7 @@ mod tests { let rocks = &index.rocks.unwrap(); rocks.write(batch).unwrap(); - let result = wordinfos_from_rocks(&rocks); + let result = positions_from_rocks(&rocks); assert!(result.is_empty()); } -} */ +} diff --git a/src/stems.rs b/src/stems.rs index 48a37ac..16dcc83 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -19,13 +19,8 @@ pub struct Stems<'a> { pub struct StemmedWord { // Where the stemmed word starts pub word_pos: u32, - // Where the suffix starts - pub suffix_offset: u32, // The stemmed word pub stemmed: String, - // The difference between the stemmed word and the original lowercased one. 
It can be - // used to recontruct the original word (for exact match searches) - pub suffix: String, } @@ -37,14 +32,6 @@ impl<'a> Stems<'a> { word_position: 0, } } - - /// Return the *byte* length of the common prefix between two strings - fn common_prefix_len(aa: &str, bb: &str) -> usize { - aa.chars() - .zip(bb.chars()) - .take_while(|&(a, b)| a == b) - .fold(0, |acc, (a, _)| acc + a.len_utf8()) - } } impl<'a> Iterator for Stems<'a> { @@ -75,9 +62,7 @@ impl<'a> Iterator for Stems<'a> { // wouldn't be possible. return Some(StemmedWord { word_pos: 0, - suffix_offset: 0, stemmed: String::new(), - suffix: String::new(), }); } else { return None; @@ -96,15 +81,12 @@ impl<'a> Iterator for Stems<'a> { self.word_position += 1; return Some(StemmedWord { word_pos: 0, - suffix_offset: word_to_stem.len() as u32, stemmed: word_to_stem, - suffix: String::new(), }); } // normalized contains our stemmable word. advance the iter since we only peeked. self.words.next(); word_to_stem = normalized; - let mut suffix = word_to_stem.clone(); loop { // loop through all non-alphabetic chars and add to suffix match self.words.peek() { @@ -113,7 +95,6 @@ impl<'a> Iterator for Stems<'a> { if normalized.chars().next().unwrap().is_alphabetic() { break; } else { - suffix.push_str(&normalized); self.words.next(); } }, @@ -121,12 +102,9 @@ impl<'a> Iterator for Stems<'a> { } } let stemmed = self.stemmer.stem(&word_to_stem.to_lowercase()); - let prefix_len = Stems::common_prefix_len(&stemmed, &suffix); let ret = StemmedWord { word_pos: self.word_position as u32, - suffix_offset: prefix_len as u32, stemmed: stemmed, - suffix: (&suffix[prefix_len..]).to_string(), }; self.word_position += 1; Some(ret) @@ -143,21 +121,14 @@ mod tests { let input = "THEse Words deeplY test smOOthly that stemmING"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { word_pos: 0, suffix_offset: 0, - stemmed: String::from("these"), suffix: String::from("THEse ") }, - StemmedWord { word_pos: 1, suffix_offset: 0, - stemmed: String::from("word"), suffix: String::from("Words ") }, + StemmedWord { word_pos: 0, stemmed: String::from("these")}, + StemmedWord { word_pos: 1, stemmed: String::from("word")}, // "deeply" stems to "deepli" - StemmedWord { word_pos: 2, suffix_offset: 5, - stemmed: String::from("deepli"), suffix: String::from("Y ") }, - StemmedWord { word_pos: 3, suffix_offset: 4, - stemmed: String::from("test"), suffix: String::from(" ") }, - StemmedWord { word_pos: 4, suffix_offset: 2, - stemmed: String::from("smooth"), suffix: String::from("OOthly ") }, - StemmedWord { word_pos: 5, suffix_offset: 4, - stemmed: String::from("that"), suffix: String::from(" ") }, - StemmedWord { word_pos: 6, suffix_offset: 4, - stemmed: String::from("stem"), suffix: String::from("mING") }, + StemmedWord { word_pos: 2, stemmed: String::from("deepli")}, + StemmedWord { word_pos: 3, stemmed: String::from("test")}, + StemmedWord { word_pos: 4, stemmed: String::from("smooth")}, + StemmedWord { word_pos: 5, stemmed: String::from("that")}, + StemmedWord { word_pos: 6, stemmed: String::from("stem")}, ]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { @@ -169,10 +140,7 @@ mod tests { fn test_stems_nonchars() { let input = " @#$!== \t+-"; let result = Stems::new(input).collect::>(); - assert_eq!(result, vec![ - StemmedWord { word_pos: 0, suffix_offset: 12, - stemmed: String::from(" @#$!== \t+-"), suffix: String::from("") }, - ]); + assert_eq!(result, vec![StemmedWord { word_pos: 
0, stemmed: String::from(" @#$!== \t+-")}]); } #[test] @@ -180,12 +148,9 @@ mod tests { let input = "@!? Let's seeing..."; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { word_pos: 0, suffix_offset: 6, - stemmed: String::from("@!? "), suffix: String::from("") }, - StemmedWord { word_pos: 1, suffix_offset: 0, - stemmed: String::from("let"), suffix: String::from("Let's ") }, - StemmedWord { word_pos: 2, suffix_offset: 3, - stemmed: String::from("see"), suffix: String::from("ing...") }, + StemmedWord { word_pos: 0, stemmed: String::from("@!? ")}, + StemmedWord { word_pos: 1, stemmed: String::from("let")}, + StemmedWord { word_pos: 2, stemmed: String::from("see")}, ]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { @@ -198,10 +163,8 @@ mod tests { let input = "Ünicöde stemming"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { word_pos: 0, suffix_offset: 0, - stemmed: String::from("ünicöd"), suffix: String::from("Ünicöde ") }, - StemmedWord { word_pos: 1, suffix_offset: 4, - stemmed: String::from("stem"), suffix: String::from("ming") }, + StemmedWord { word_pos: 0, stemmed: String::from("ünicöd")}, + StemmedWord { word_pos: 1, stemmed: String::from("stem")}, ]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { @@ -213,10 +176,7 @@ mod tests { fn test_stems_unicode_lowercase_has_more_bytes() { let input = "İ"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { word_pos: 0, suffix_offset: 0, - stemmed: String::from("i̇"), suffix: String::from("İ") }, - ]; + let expected = vec![StemmedWord { word_pos: 0, stemmed: String::from("i̇")}]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -249,38 +209,11 @@ mod tests { let input = "\u{03A1}\u{0313}\u{03C1}\u{0313}\u{1FE4}"; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { word_pos: 0, suffix_offset: 0, - stemmed: String::from("\u{03C1}\u{0313}\u{1FE4}\u{1FE4}"), - suffix: String::from("\u{03A1}\u{0313}\u{1FE4}\u{1FE4}") }, + StemmedWord { word_pos: 0, stemmed: String::from("\u{03C1}\u{0313}\u{1FE4}\u{1FE4}")}, ]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); } } - - #[test] - fn test_common_prefix_len() { - let tests = vec![ - ("a", "a", 1), - ("ab", "a", 1), - ("a", "ab", 1), - ("ab", "ab", 2), - ("a", "b", 0), - ("b", "a", 0), - ("ab", "cd", 0), - ("ab", "bc", 0), - ("abc", "abd", 2), - ("ac", "abcd", 1), - (" a", "a", 0), - ("a", "a ", 1), - ("xyzabc", "xyz", 3), - ("xyz", "xyzabc", 3), - ("öxyz", "öx", 3), - ]; - for (aa, bb, expected) in tests { - let prefix_len = Stems::common_prefix_len(aa, bb); - assert_eq!(prefix_len, expected); - } - } } From 1305ec8f8bc3e9affd4417627bf4c8941496305e Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 13 Jan 2017 15:48:52 -0800 Subject: [PATCH 059/122] Normalize object keys in key_builder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All keys are now normalized when added to key_builder. Added test for two semantically equivalent keys that won’t match in the index if not normalized. 
---
 src/key_builder.rs | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/key_builder.rs b/src/key_builder.rs
index 9a9014c..62977f3 100644
--- a/src/key_builder.rs
+++ b/src/key_builder.rs
@@ -1,6 +1,10 @@
+extern crate unicode_normalization;
+
 use query::DocResult;
 use std::str;

+use self::unicode_normalization::UnicodeNormalization;
+
 pub enum Segment {
     ObjectKey(String),
     Array(u64),
@@ -77,7 +81,7 @@ impl KeyBuilder {
         KeyBuilder::add_arraypath(&mut string, &self.arraypath);
         string
     }
-
+
     /// Builds a field length key for the DocResult, using the key_path
     /// built up internally and the arraypath from the DocResult.
     pub fn field_length_key_from_doc_result(&self, dr: &DocResult) -> String {
@@ -217,7 +221,9 @@ impl KeyBuilder {
     pub fn push_object_key(&mut self, key: &str) {
         let mut escaped_key = String::with_capacity((key.len() * 2) + 1); // max expansion
         escaped_key.push('.');
-        for cc in key.chars() {
+
+        // normalize the key otherwise we might not match unnormalized but equivalent keys
+        for cc in key.nfkc() {
             // Escape chars that conflict with delimiters
             if "\\$.!#".contains(cc) {
                 escaped_key.push('\\');
@@ -408,6 +414,13 @@ mod tests {
         assert_eq!(kb.keypath_segments_len(), 0, "No segments so far");
     }

+    #[test]
+    fn test_segments_canonical() {
+        let mut kb = KeyBuilder::new();
+        kb.push_object_key("\u{0041}\u{030A}");
+        assert_eq!(kb.stemmed_word_key("word", 1), "W.Å!word#1,");
+    }
+
     #[test]
     fn test_doc_result_parse() {
         let key = "W.foo$.bar$!word#123,1,0".to_string();

From 1f2075c7837ac2be1acd97d51fe37d01da4d7c9f Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Sun, 15 Jan 2017 19:31:33 -0800
Subject: [PATCH 060/122] Added test scripts with simple commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commands are create, open, drop, add, find, pretty on, and pretty off.
Commands can span multiple lines and must end with a semicolon (;).
Comments can be added on any line with a hash (#) character; it must be
the first character on the line. pretty on will pretty print the json
results. The default is pretty off.

Moved most of the tests in query into the new test scripts. The test
scripts are run with cargo test. Failures are reported and the failing
script output is written to a .reject file in the repl-tests directory
so it can be examined. To update or add tests, save your changes and
then run ./update-test-repl.sh from the root of the repository. It will
then update the test files with the outputs where they can be examined
for correctness.

You can run the commands interactively as well, but there is no command
completion or repeat commands, so it's not very fun!
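For context, a rough sketch of the same flow driven through the library API
instead of the repl (the database path and query literal are made up, and it
assumes the repl hands the query text, minus the terminating semicolon, to
Query::get_matches):

    extern crate noise;

    use std::io;

    use noise::index::{Index, OpenOptions};
    use noise::json_value::PrettyPrint;
    use noise::query::Query;

    fn main() {
        let dbname = "target/tests/sketchdb";
        let _ = Index::delete(dbname);

        let mut index = Index::new();
        index.open(dbname, Some(OpenOptions::Create)).unwrap();
        // add() now returns the _id of the inserted document
        let id = index.add(r#"{"_id":"1", "bar": "quick brown fox"}"#).unwrap();
        assert_eq!(id, "1");
        index.flush().unwrap();

        // QueryResults now implements Iterator, so result rows can be
        // consumed with a plain for loop; `pretty on` corresponds to
        // rendering each returned JsonValue through PrettyPrint.
        let mut pretty = PrettyPrint::new("  ", "\n", " ");
        let results = Query::get_matches(r#"find {bar: ~="fox"} return ._id"#, &index).unwrap();
        for result in results {
            result.unwrap().render(&mut io::stdout(), &mut pretty).unwrap();
            println!("");
        }
    }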
--- repl-tests/bind_var.noise | 60 +++ repl-tests/collation.noise | 125 +++++ repl-tests/group.noise | 163 ++++++ repl-tests/not.noise | 96 ++++ repl-tests/query_basic.noise | 227 ++++++++ repl-tests/scoring.noise | 197 +++++++ src/index.rs | 42 +- src/json_value.rs | 101 +++- src/lib.rs | 3 +- src/main.rs | 24 +- src/parser.rs | 16 +- src/query.rs | 966 ++--------------------------------- src/repl.rs | 130 +++++ tests/repl_tests.rs | 57 +++ update-test-repl.sh | 25 + 15 files changed, 1258 insertions(+), 974 deletions(-) create mode 100644 repl-tests/bind_var.noise create mode 100644 repl-tests/collation.noise create mode 100644 repl-tests/group.noise create mode 100644 repl-tests/not.noise create mode 100644 repl-tests/query_basic.noise create mode 100644 repl-tests/scoring.noise create mode 100644 src/repl.rs create mode 100644 tests/repl_tests.rs create mode 100755 update-test-repl.sh diff --git a/repl-tests/bind_var.noise b/repl-tests/bind_var.noise new file mode 100644 index 0000000..59e970f --- /dev/null +++ b/repl-tests/bind_var.noise @@ -0,0 +1,60 @@ +# Bind Variables testing. Feature is not complete + +drop target/tests/querytestbindvar; +create target/tests/querytestbindvar; + +add {"_id":"1", "bar": [{"a":"foo","v":1},{"a":"bar","v":2}]}; +"1" + + +find {bar: x::[{a: =="foo"}]} +return x ; +[ +[{"a":"foo","v":1}] +] + +find {bar: x::[{a: =="foo"}]} +return x.v ; +[ +[1] +] + +find {bar: x::[{a: =="foo" || a: =="bar"}]} +return x.v ; +[ +[1,2] +] + +find {bar: x::[{a: =="foo" || a: =="baz"}]} +return x.v ; +[ +[1] +] + +find {bar: x::[{a: =="foof" || a: =="bar"}]} +return x.v ; +[ +[2] +] + +find {bar: x::[{a: =="foo"}] || bar: x::[{a: =="bar"}]} +return x.v ; +[ +[1,2] +] + +find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="bar"}]} +return [x.v, y.v] ; +[ +[[1],[2]] +] + +find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="baz"}]} +return [x.v, y.v default=0] ; +[ +[[1],[0]] +] + +find {bar: x::[{a: =="foo"}] && bar: y::[{a: =="baz"}]} +return [x.v, y.v] ; +[] diff --git a/repl-tests/collation.noise b/repl-tests/collation.noise new file mode 100644 index 0000000..87c0f4e --- /dev/null +++ b/repl-tests/collation.noise @@ -0,0 +1,125 @@ +# Sort expressions. 
+ +drop target/tests/querytestjsoncollation; +create target/tests/querytestjsoncollation; + +add {"_id":"1", "foo":"coll", "bar": {}}; +"1" +add {"_id":"2", "foo":"coll", "bar": {"foo":"bar"}}; +"2" +add {"_id":"3", "foo":"coll", "bar": {"foo":"baz"}}; +"3" +add {"_id":"4", "foo":"coll", "bar": {"foo":"baz","bar":"baz"}}; +"4" +add {"_id":"5", "foo":"coll", "bar": {"foo":"baz","bar":"bar"}}; +"5" +add {"_id":"6", "foo":"coll", "bar": 1}; +"6" +add {"_id":"7", "foo":"coll", "bar": 1.00001}; +"7" +add {"_id":"8", "foo":"coll", "bar": 2.00001}; +"8" +add {"_id":"9", "foo":"coll", "bar": true}; +"9" +add {"_id":"10", "foo":"coll", "bar": false}; +"10" +add {"_id":"11", "foo":"coll", "bar": null}; +"11" +add {"_id":"12", "foo":"coll", "bar": []}; +"12" +add {"_id":"13", "foo":"coll", "bar": [true]}; +"13" +add {"_id":"14", "foo":"coll", "bar": [null]}; +"14" +add {"_id":"15", "foo":"coll", "bar": "string"}; +"15" +add {"_id":"16", "foo":"coll", "bar": "string2"}; +"16" +add {"_id":"17", "foo":"coll", "bar": "string3"}; +"17" + +find {foo: =="coll"} +sort .bar asc +return .bar ; +[ +null, +false, +true, +1, +1.00001, +2.00001, +"string", +"string2", +"string3", +[], +[null], +[true], +{}, +{"bar":"bar","foo":"baz"}, +{"bar":"baz","foo":"baz"}, +{"foo":"bar"}, +{"foo":"baz"} +] + +find {foo: =="coll"} +sort .bar asc +return .bar +limit 5; +[ +null, +false, +true, +1, +1.00001 +] + +find {foo: =="coll"} +sort .bar asc +return .bar +limit 1; +[ +null +] + +add {"_id":"20", "foo":"coll2", "bar":[1,1,1]}; +"20" +add {"_id":"21", "foo":"coll2", "bar":[1,1,2]}; +"21" +add {"_id":"22", "foo":"coll2", "bar":[1,2,2]}; +"22" +add {"_id":"23", "foo":"coll2", "bar":[2,2,2]}; +"23" +add {"_id":"24", "foo":"coll2", "bar":[2,1,1]}; +"24" +add {"_id":"25", "foo":"coll2", "bar":[2,1,2]}; +"25" +add {"_id":"26", "foo":"coll2", "bar":[2,3,2]}; +"26" +add {"_id":"27", "foo":"coll2", "bar":[3,4,3]}; +"27" +add {"_id":"28", "foo":"coll2", "bar":[5,4,3]}; +"28" +add {"_id":"29", "foo":"coll2", "bar":[5,5,5]}; +"29" + +find {foo: =="coll2"} +sort .bar[0] asc, .bar[1] desc, .bar[2] desc +return [.bar[0], .bar[1], .bar[2]] ; +[ +[1,2,2], +[1,1,2], +[1,1,1], +[2,3,2], +[2,2,2], +[2,1,2], +[2,1,1], +[3,4,3], +[5,5,5], +[5,4,3] +] + +find {foo: =="group2"} +sort .baz asc, .bar desc +return [.baz, .bar] +limit 2; +[] diff --git a/repl-tests/group.noise b/repl-tests/group.noise new file mode 100644 index 0000000..22a2ab4 --- /dev/null +++ b/repl-tests/group.noise @@ -0,0 +1,163 @@ +# Group and aggregate tests + +drop target/tests/querytestgroup; +create target/tests/querytestgroup; + +add {"_id":"1", "foo":"group", "baz": "a", "bar": 1}; +"1" +add {"_id":"2", "foo":"group", "baz": "b", "bar": 2}; +"2" +add {"_id":"3", "foo":"group", "baz": "c", "bar": 3}; +"3" +add {"_id":"4", "foo":"group", "baz": "a", "bar": 1}; +"4" +add {"_id":"5", "foo":"group", "baz": "b", "bar": 2}; +"5" +add {"_id":"6", "foo":"group", "baz": "c", "bar": 3}; +"6" +add {"_id":"7", "foo":"group", "baz": "a", "bar": 1}; +"7" +add {"_id":"8", "foo":"group", "baz": "b", "bar": 2}; +"8" +add {"_id":"9", "foo":"group", "baz": "c", "bar": 3}; +"9" +add {"_id":"10", "foo":"group", "baz": "a", "bar": 1}; +"10" +add {"_id":"11", "foo":"group", "baz": "b", "bar": 2}; +"11" +add {"_id":"12", "foo":"group", "baz": "c", "bar": 3}; +"12" + +find {foo: =="group"} +return {baz: group(.baz), bar: sum(.bar)}; +[ +{"baz":"a","bar":4}, +{"baz":"b","bar":8}, +{"baz":"c","bar":12} +] + +find {foo: =="group"} +return {bar: sum(.bar)}; +[ +{"bar":24} +] + +find {foo: =="group"} 
+return {bar: avg(.bar)}; +[ +{"bar":2} +] + +find {foo: =="group"} +return {baz: group(.baz), concat: concat(.baz sep="|")}; +[ +{"baz":"a","concat":"a|a|a|a"}, +{"baz":"b","concat":"b|b|b|b"}, +{"baz":"c","concat":"c|c|c|c"} +] + +find {foo: =="group"} +return {baz: group(.baz), list: list(.baz)}; +[ +{"baz":"a","list":["a","a","a","a"]}, +{"baz":"b","list":["b","b","b","b"]}, +{"baz":"c","list":["c","c","c","c"]} +] + +find {foo: =="group"} +return {baz: group(.baz), count: count()}; +[ +{"baz":"a","count":4}, +{"baz":"b","count":4}, +{"baz":"c","count":4} +] + +find {foo: =="group"} +return {max: max(.bar)}; +[ +{"max":3} +] + +find {foo: =="group"} +return {min: min(.bar)}; +[ +{"min":1} +] + +find {foo: =="group"} +return {max: max(.baz)}; +[ +{"max":"c"} +] + +add {"_id":"1", "foo":"group2", "baz": "a", "bar": "a"}; +"1" +add {"_id":"2", "foo":"group2", "baz": "a", "bar": "b"}; +"2" +add {"_id":"3", "foo":"group2", "baz": "b", "bar": "a"}; +"3" +add {"_id":"4", "foo":"group2", "baz": "b", "bar": "b"}; +"4" +add {"_id":"5", "foo":"group2", "baz": "a", "bar": "a"}; +"5" +add {"_id":"6", "foo":"group2", "baz": "a", "bar": "c"}; +"6" +add {"_id":"7", "foo":"group2", "baz": "b", "bar": "d"}; +"7" +add {"_id":"8", "foo":"group2", "baz": "b", "bar": "e"}; +"8" +add {"_id":"9", "foo":"group2", "baz": "a", "bar": "f"}; +"9" + +find {foo: =="group2"} +return [group(.baz order=asc), group(.bar order=desc), count()]; +[ +["a","f",1], +["a","c",1], +["a","b",1], +["a","a",2], +["b","e",1], +["b","d",1], +["b","b",1], +["b","a",1] +] + +find {foo: =="group2"} +return [group(.baz order=asc), group(.bar order=desc), count()] +limit 2; +[ +["a","f",1], +["a","c",1] +] + +add {"_id":"1", "foo":"group3", "baz": "a", "bar": "a"}; +"1" +add {"_id":"2", "foo":"group3", "bar": "b"}; +"2" +add {"_id":"3", "foo":"group3", "baz": "b", "bar": "a"}; +"3" +add {"_id":"4", "foo":"group3", "baz": "b", "bar": "b"}; +"4" +add {"_id":"5", "foo":"group3", "baz": "a", "bar": "a"}; +"5" +add {"_id":"6", "foo":"group3", "baz": "a" }; +"6" +add {"_id":"7", "foo":"group3", "baz": "b", "bar": "d"}; +"7" +add {"_id":"8", "foo":"group3", "baz": "b", "bar": "e"}; +"8" +add {"_id":"9", "foo":"group3", "baz": "a", "bar": "f"}; +"9" + +find {foo: =="group2"} +return [group(.baz order=asc) default="a", group(.bar order=desc) default="c", count()]; +[ +["a","f",1], +["a","c",1], +["a","b",1], +["a","a",2], +["b","e",1], +["b","d",1], +["b","b",1], +["b","a",1] +] diff --git a/repl-tests/not.noise b/repl-tests/not.noise new file mode 100644 index 0000000..9a1d9f3 --- /dev/null +++ b/repl-tests/not.noise @@ -0,0 +1,96 @@ +# Logical not tests + +drop target/tests/querytestnot; +create target/tests/querytestnot; + + +add {"_id":"1", "bar": "fox"}; +"1" +add {"_id":"2", "bar": "quick fox"}; +"2" +add {"_id":"3", "bar": "quick brown fox"}; +"3" +add {"_id":"4", "bar": ["fox"]}; +"4" +add {"_id":"5", "bar": ["quick fox"]}; +"5" +add {"_id":"6", "bar": ["quick brown fox"]}; +"6" +add {"_id":"7", "baz": ["fox"]}; +"7" +add {"_id":"8", "baz": ["quick","fox"]}; +"8" +add {"_id":"9", "baz": ["quick","brown","fox"]}; +"9" + +find {(bar: ~="fox" || bar: ~="brown") && (bar: !~="quick")} +return ._id ; +[ +"1" +] + +find {(bar: ~="fox" || bar: ~="brown") && !(bar: ~="quick")} +return ._id ; +[ +"1" +] + +find {bar: ~="fox" || bar: ~="brown"} && !{bar: ~="quick"} +return ._id ; +[ +"1" +] + +find {bar: [(~="fox" || ~="brown") && !~="quick"]} +return ._id ; +[ +"4" +] + +find {bar: [(~="fox" || ~="brown") && !(~="quick")]} +return ._id ; +[ +"4" +] 
+ +find {bar: [~="fox" || ~="brown"] && bar: ![~="quick"]} +return ._id ; +[ +"4" +] + +find {baz: [(~="fox" || ~="brown") && !~="quick"]} +return ._id ; +[ +"7", +"8", +"9" +] + +find {baz: [(~="fox" || ~="brown") && !(~="quick")]} +return ._id ; +[ +"7", +"8", +"9" +] + +find {baz: [~="fox" || ~="brown"] && baz: ![~="quick"]} +return ._id ; +[ +"7" +] + +# Test for unallowable expressions + +find !{baz: [~="fox"]} +return ._id ; +Parse error: query cannot be made up of only logical not. Must have at least match clause not negated. + +find !{baz: ~="fox"} && !{baz: =="foo"} +return ._id ; +Parse error: Logical not ("!") is nested inside of another logical not. This is not allowed. + +find {foo: =="bar"} && !{baz: !~="fox"} +return ._id ; +Parse error: Logical not ("!") is nested inside of another logical not. This is not allowed. diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise new file mode 100644 index 0000000..648ddcc --- /dev/null +++ b/repl-tests/query_basic.noise @@ -0,0 +1,227 @@ +# Some basic tests + +drop target/tests/querytestdbbasic1; +create target/tests/querytestdbbasic1; + + +add {"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}; +"1" +add {"_id":"2", "A":[{"B":"B2","C":[{"D":"D"}]},{"B": "b1","C":"C2"}]}; +"2" +add {"_id":"3", "A":"Multi word sentence"}; +"3" +add {"_id":"4", "A":"%&%}{}@);€"}; +"4" +add {"_id":"5", "A":"{}€52 deeply \\n\\v "}; +"5" +add {"_id":"6", "A":[{"B":"B3"},{"B": "B3"}]}; +"6" +add {"_id":"7", "A":[{"B":"B3"},{"B": "B4"}]}; +"7" +add {"_id":"8", "A":["A1", "A1"]}; +"8" +add {"_id":"9", "A":["A1", "A2"]}; +"9" +add {"_id":"10", "A":"a bunch of words in this sentence"}; +"10" +add {"_id":"11", "A":""}; +"11" +add {"_id":"12", "A":["1","2","3","4","5","6","7","8","9","10","11","12"]}; +"12" +add {"_id":"13", "A":["foo",1,true,false,null,{},[]]}; +"13" + + +# Exact match object fields in arrays + +find {A:[{B: =="B2", C: [{D: =="D"} ]}]}; +[ +"2" +] + +find {A:[{B: == "B2", C: == "C2"}]}; +[ +"1" +] + +find {A:[{B: == "B2", C: == "C8"}]}; +[] + +find {A:[{B: == "b1", C: == "C2"}]}; +[ +"1", +"2" +] + +# exact match stuff in fields + +find {A: == "Multi word sentence"}; +[ +"3" +] + +find {A: == "%&%}{}@);€"}; +[ +"4" +] + +find {A: == "{}€52 deeply \\n\\v "}; +[ +"5" +] + +find {A:[{C: == "C2"}]}; +[ +"1", +"2" +] + +find {A:[{B: == "B3" || B: == "B4"}]}; +[ +"6", +"7" +] + +# exact match strings in arrays + + + +find {A:[ == "A1" || == "A2"]}; +[ +"8", +"9" +] + +find {A:[ == "A1" && == "A" || == "A1"]}; +[ +"8", +"9" +] + +find {A:[=="A" || == "A1" && == "A"]}; +[] + +# full text search fields + + + +find {A: ~= "Multi"}; +[ +"3" +] + +# phrase match + +find {A: ~= "multi word"}; +[ +"3" +] + +find {A: ~= "word sentence"}; +[ +"3" +] + +find {A: ~= "sentence word"}; +[] + +# proximity match. Number indicates how many word away terms can be. 
+ +find {A: ~1= "multi sentence"}; +[ +"3" +] + +find {A: ~4= "a sentence"}; +[] + +find {A: ~5= "a sentence"}; +[ +"10" +] + +find {A: ~4= "a bunch of words sentence"}; +[] + +find {A: ~5= "a bunch of words sentence"}; +[ +"10" +] + +find {A: ~10= "a bunch of words sentence"}; +[ +"10" +] + +find {A: == ""}; +[ +"11" +] + +# test return json elements + +find {A:[ == "1"]} +return .A ; +[ +["1","2","3","4","5","6","7","8","9","10","11","12"] +] + +find {A:[ == "2"]} +return .A[0] ; +[ +"1" +] + +find {A:[ == "2"]} +return [.A[0], ._id] ; +[ +["1","12"] +] + +find {A:[ == "2"]} +return {foo:.A[0], bar: ._id} ; +[ +{"foo":"1","bar":"12"} +] + +find {A:[ == "foo"]} +return .A ; +[ +["foo",1,true,false,null,{},[]] +] + +# returning null when missing + +find {A:[ == "foo"]} +return .B ; +[ +null +] + +# returning default values when missing + +find {A:[ == "foo"]} +return .B default={foo:"foo"}; +[ +{"foo":"foo"} +] + +find {A:[ == "foo"]} +return .B default={}; +[ +{} +] + +find {A:[ == "foo"]} +return {foo: .B default={bar:"bar"}}; +[ +{"foo":{"bar":"bar"}} +] + +# return every kind of element + +find {A:[ == "foo"]} +return {"a":"a","b":1.123,"true":true,"false":false,"null":null,array:[],object:{}}; +[ +{"a":"a","b":1.123,"true":true,"false":false,"null":null,"array":[],"object":{}} +] diff --git a/repl-tests/scoring.noise b/repl-tests/scoring.noise new file mode 100644 index 0000000..bd43257 --- /dev/null +++ b/repl-tests/scoring.noise @@ -0,0 +1,197 @@ +# Relevancy Scoring tests + +drop target/tests/querytestscore; +create target/tests/querytestscore; + + +add {"_id":"1", "bar": "fox"}; +"1" +add {"_id":"2", "bar": "quick fox"}; +"2" +add {"_id":"3", "bar": "quick brown fox"}; +"3" + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +sort score() desc +return ._id ; +[ +"3", +"2", +"1" +] + +find {bar: ~="quick brown fox"} +sort score() desc +return ._id ; +[ +"3" +] + +find {bar: ~="quick brown fox"} +return score() ; +[ +0.05966803431510925 +] + +find {bar: ~="quick brown fox"^2} +return score() ; +[ +0.1193360686302185 +] + +find {bar: =="quick brown fox"} +return score() ; +[ +0.05966803431510925 +] + +find {bar: =="quick brown fox"^2} +return score() ; +[ +0.1193360686302185 +] + +find {bar: ~2="quick brown fox"} +return score() ; +[ +0.0916677787899971 +] + +find {bar: ~2="quick brown fox"^2} +return score() ; +[ +0.1833355575799942 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +sort score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find ({bar: ~="fox" || bar: ~="brown" || bar: ~="quick"})^2 +sort score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +sort score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"}^2 +sort score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +sort score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, +0.07121198624372482 +] + +find {bar: ~="fox"^2 || (bar: ~="brown" || bar: ~="quick")^2 } +sort score() desc +return score() ; +[ +1.1547003984451294, +0.4937303364276886, +0.14242397248744965 +] + +find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} +sort score() desc +return score() ; +[ +0.5773501992225647, +0.2468651682138443, 
+0.07121198624372482
+]
+
+find {bar: ~="fox"}^2 || {bar: ~="brown" || bar: ~="quick"}^2
+sort score() desc
+return score() ;
+[
+1.1547003984451294,
+0.4937303364276886,
+0.14242397248744965
+]
+
+add {"_id":"4", "bar": ["fox"]};
+"4"
+add {"_id":"5", "bar": ["quick fox"]};
+"5"
+add {"_id":"6", "bar": ["quick brown fox"]};
+"6"
+
+find {bar:[ ~="fox" || ~="brown" || ~="quick"]}
+sort score() desc
+return score() ;
+[
+0.5773501992225647,
+0.2468651682138443,
+0.07121198624372482
+]
+
+find {bar:[~="fox" || ~="brown" || ~="quick"]^2}
+sort score() desc
+return score() ;
+[
+1.1547003984451294,
+0.4937303364276886,
+0.14242397248744965
+]
+
+find {bar:[ ~="fox" || ~="brown" || ~="quick"]}
+sort score() desc
+return score() ;
+[
+0.5773501992225647,
+0.2468651682138443,
+0.07121198624372482
+]
+
+find {bar:[~="fox"]^2 || bar:[~="brown" || ~="quick"]^2}
+sort score() desc
+return score() ;
+[
+1.1547003984451294,
+0.4937303364276886,
+0.14242397248744965
+]
+
+find {bar:[ ~="fox" || ~="brown" || ~="quick"]}
+sort score() desc
+return score() ;
+[
+0.5773501992225647,
+0.2468651682138443,
+0.07121198624372482
+]
+
+find {bar:[~="fox"]^2 || (bar:[~="brown"] || bar:[~="quick"])^2}
+sort score() desc
+return score() ;
+[
+1.1547003984451294,
+0.4937303364276886,
+0.14242397248744965
+]
diff --git a/src/index.rs b/src/index.rs
index eb52fea..2c1de5c 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -35,7 +35,7 @@ impl Index {
             high_doc_seq: 0,
             rocks: None,
             id_str_to_id_seq: HashMap::new(),
-            batch: Some(rocksdb::WriteBatch::default()),
+            batch: None,
         }
     }
     // NOTE vmx 2016-10-13: Perhaps the name should be specified on `new()` as it is bound
@@ -44,6 +44,9 @@ impl Index {
     //fn open(&mut self, name: &str, open_options: Option) -> Result {
     pub fn open(&mut self, name: &str, open_options: Option<OpenOptions>) -> Result<(), Error> {
         let mut rocks_options = rocksdb::Options::default();
+        rocks_options.set_comparator("noise", Index::compare_keys);
+        rocks_options.set_merge_operator("noise", Index::sum_merge);
+
         let rocks = match rocksdb::DB::open(&rocks_options, name) {
             Ok(rocks) => rocks,
             Err(error) => {
@@ -53,12 +56,8 @@
                 }

                 rocks_options.create_if_missing(true);
-                rocks_options.set_comparator("noise", Index::compare_keys);
-                rocks_options.set_merge_operator("noise", Index::sum_merge);
-
                 let rocks = try!(rocksdb::DB::open(&rocks_options, name));
-
                 let mut bytes = Vec::with_capacity(8*2);
                 bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap();
@@ -78,6 +77,8 @@
         // next 8 is high seq
         self.high_doc_seq = Index::convert_bytes_to_u64(&value[8..]);

+        self.batch = Some(rocksdb::WriteBatch::default());
+
         Ok(())
     }
@@ -88,28 +89,29 @@
         Ok(ret)
     }

-    pub fn add(&mut self, json: &str) -> Result<(), Error> {
+    pub fn add(&mut self, json: &str) -> Result<String, Error> {
+        if self.rocks.is_none() {
+            return Err(Error::Write("Index isn't open.".to_string()));
+        }
         let mut shredder = Shredder::new();
-        // NOTE vmx 2016-10-13: Needed for the lifetime-checker, though not sure if it now really
-        // does the right thing. Does the `try!()` still return as epected?
- { - let docid = try!(shredder.shred(json, self.high_doc_seq + 1, - self.batch.as_mut().unwrap())); - if self.id_str_to_id_seq.contains_key(&docid) { - return Err(Error::Write("Attempt to insert multiple docs with same _id" - .to_string())); - } - self.high_doc_seq += 1; - self.id_str_to_id_seq.insert(format!("I{}", docid), format!("{}", self.high_doc_seq)); + + let docid = try!(shredder.shred(json, self.high_doc_seq + 1, + self.batch.as_mut().unwrap())); + if self.id_str_to_id_seq.contains_key(&docid) { + return Err(Error::Write("Attempt to insert multiple docs with same _id" + .to_string())); } - Ok(()) + self.high_doc_seq += 1; + self.id_str_to_id_seq.insert(format!("I{}", docid), format!("{}", self.high_doc_seq)); + Ok(docid) } // Store the current batch pub fn flush(&mut self) -> Result<(), Error> { // Flush can only be called if the index is open - // NOTE vmx 2016-10-17: Perhaps that shouldn't panic? - assert!(&self.rocks.is_some()); + if self.rocks.is_none() { + return Err(Error::Write("Index isn't open.".to_string())); + } let rocks = self.rocks.as_ref().unwrap(); // Look up all doc ids and 'delete' from the seq_to_ids keyspace diff --git a/src/json_value.rs b/src/json_value.rs index e97c24a..7151cf8 100644 --- a/src/json_value.rs +++ b/src/json_value.rs @@ -125,49 +125,88 @@ impl JsonValue { } } - pub fn render(&self, write: &mut Write) -> Result<(), Error> { + pub fn render(&self, write: &mut Write, pretty: &mut PrettyPrint) -> Result<(), Error> { match self { - &JsonValue::Number(ref num) => try!(write.write_all(num.to_string().as_bytes())), + &JsonValue::Number(ref num) => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all(num.to_string().as_bytes())); + }, &JsonValue::String(ref string) => { + try!(write.write_all(pretty.prefix())); try!(write.write_all(JsonValue::str_to_literal(&string).as_bytes())) }, &JsonValue::Array(ref array) => { + if array.is_empty() { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("[]".as_bytes())); + return Ok(()); + } + try!(write.write_all(pretty.prefix())); try!(write.write_all("[".as_bytes())); + try!(write.write_all(pretty.newline())); + pretty.push(); let mut iter = array.iter().peekable(); loop { match iter.next() { - Some(json) => try!(json.render(write)), + Some(json) => { + try!(json.render(write, pretty)) + }, None => break, } if iter.peek().is_some() { try!(write.write_all(",".as_bytes())); } + try!(write.write_all(pretty.newline())); } + pretty.pop(); + try!(write.write_all(pretty.prefix())); try!(write.write_all("]".as_bytes())); }, &JsonValue::Object(ref object) => { + if object.is_empty() { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("{}".as_bytes())); + return Ok(()); + } + try!(write.write_all(pretty.prefix())); try!(write.write_all("{".as_bytes())); + try!(write.write_all(pretty.newline())); + pretty.push(); let mut iter = object.iter().peekable(); loop { match iter.next() { Some(&(ref key, ref json)) => { + try!(write.write_all(pretty.prefix())); try!(write.write_all(JsonValue::str_to_literal(&key).as_bytes())); try!(write.write_all(":".as_bytes())); - try!(json.render(write)); + pretty.next_prefix_is_space(); + try!(json.render(write, pretty)); } None => break, } if iter.peek().is_some() { try!(write.write_all(",".as_bytes())); } + try!(write.write_all(pretty.newline())); } + pretty.pop(); + try!(write.write_all(pretty.prefix())); try!(write.write_all("}".as_bytes())); }, - &JsonValue::True => try!(write.write_all("true".as_bytes())), - &JsonValue::False => 
try!(write.write_all("false".as_bytes())), - &JsonValue::Null => try!(write.write_all("null".as_bytes())), + &JsonValue::True => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("true".as_bytes())); + }, + &JsonValue::False => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("false".as_bytes())); + }, + &JsonValue::Null => { + try!(write.write_all(pretty.prefix())); + try!(write.write_all("null".as_bytes())) + }, } Ok(()) } @@ -185,4 +224,50 @@ impl Ord for JsonValue { Ordering::Equal => self_cmp_fun(self, other), } } -} \ No newline at end of file +} + +pub struct PrettyPrint { + indention: String, + newline: String, + spacing: String, + buffer: String, + next_prefix_is_space: bool +} + +impl PrettyPrint { + pub fn new(indention: &str, newline: &str, spacing: &str) -> PrettyPrint { + PrettyPrint { + indention: indention.to_string(), + newline: newline.to_string(), + spacing: spacing.to_string(), + buffer: String::new(), + next_prefix_is_space: false, + } + } + + pub fn push(&mut self) { + self.buffer += &self.indention; + } + + pub fn pop(&mut self) { + let len = self.buffer.len() - self.indention.len(); + self.buffer.truncate(len); + } + + pub fn next_prefix_is_space(&mut self) { + self.next_prefix_is_space = true; + } + + pub fn prefix(&mut self) -> &[u8] { + if self.next_prefix_is_space { + self.next_prefix_is_space = false; + self.spacing.as_bytes() + } else { + self.buffer.as_bytes() + } + } + + pub fn newline(&mut self) -> &[u8] { + self.newline.as_bytes() + } +} diff --git a/src/lib.rs b/src/lib.rs index 04e1dc4..97aaf60 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,9 +3,10 @@ extern crate rocksdb; mod error; mod filters; mod json_shred; -mod json_value; mod key_builder; mod parser; mod stems; +pub mod repl; +pub mod json_value; pub mod index; pub mod query; diff --git a/src/main.rs b/src/main.rs index f4bcf49..df4051d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,18 +1,16 @@ extern crate noise; -use noise::index::{Index, OpenOptions}; -use noise::query::Query; +use noise::repl::repl; -fn main() { - let dbname = "querytestdb"; - let _ = Index::delete(dbname); - - let mut index = Index::new(); - index.open(dbname, Some(OpenOptions::Create)).unwrap(); - let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#); - index.flush().unwrap(); +use std::env; +use std::io::{self, BufReader}; - let mut query_results = Query::get_matches(r#"hello="world""#.to_string(), &index).unwrap(); - //let mut query_results = Query::get_matches(r#"a.b[foo="bar"]"#.to_string(), &index).unwrap(); - println!("query results: {:?}", query_results.get_next_id()); +fn main() { + let mut test_mode = false; + for argument in env::args() { + if argument == "-t" { + test_mode = true; + } + } + repl(&mut BufReader::new(io::stdin()), &mut io::stdout(), test_mode); } diff --git a/src/parser.rs b/src/parser.rs index e8465a1..b5d1394 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -20,16 +20,16 @@ use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWo use rocksdb::Snapshot; -pub struct Parser<'a> { - query: String, +pub struct Parser<'a, 'c> { + query: &'c str, offset: usize, kb: KeyBuilder, pub snapshot: Snapshot<'a>, pub needs_scoring: bool, } -impl<'a> Parser<'a> { - pub fn new(query: String, snapshot: Snapshot<'a>) -> Parser<'a> { +impl<'a, 'c> Parser<'a, 'c> { + pub fn new(query: &'c str, snapshot: Snapshot<'a>) -> Parser<'a, 'c> { Parser { query: query, offset: 0, @@ -1084,14 +1084,14 @@ mod tests { let rocks = &index.rocks.unwrap(); let mut 
snapshot = Snapshot::new(rocks); - let mut query = " \n \t test".to_string(); + let query = " \n \t test"; let mut parser = Parser::new(query, snapshot); parser.ws(); assert_eq!(parser.offset, 5); snapshot = Snapshot::new(rocks); - query = "test".to_string(); - parser = Parser::new(query, snapshot); + let query = "test".to_string(); + let mut parser = Parser::new(&query, snapshot); parser.ws(); assert_eq!(parser.offset, 0); } @@ -1107,7 +1107,7 @@ mod tests { let snapshot = Snapshot::new(rocks); let query = r#"" \n \t test""#.to_string(); - let mut parser = Parser::new(query, snapshot); + let mut parser = Parser::new(&query, snapshot); assert_eq!(parser.must_consume_string_literal().unwrap(), " \n \t test".to_string()); } } \ No newline at end of file diff --git a/src/query.rs b/src/query.rs index c6ebf44..ada6d5f 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,7 +1,6 @@ use std::str; use std::cmp::Ordering; -use std::io::Write; use std::collections::HashMap; use std::iter::Peekable; use std::mem::{transmute, swap}; @@ -13,7 +12,7 @@ use error::Error; use index::Index; use key_builder::{KeyBuilder, Segment}; use parser::Parser; -use json_value::JsonValue; +use json_value::{JsonValue}; use filters::QueryRuntimeFilter; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs @@ -160,7 +159,7 @@ pub struct QueryScoringInfo { pub struct Query {} impl Query { - pub fn get_matches<'a>(query: String, index: &'a Index) -> Result, Error> { + pub fn get_matches<'a>(query: &str, index: &'a Index) -> Result, Error> { if index.rocks.is_none() { return Err(Error::Parse("You must open the index first".to_string())); } @@ -299,7 +298,6 @@ impl Query { iter: parser.snapshot.iterator(IteratorMode::Start), snapshot: parser.snapshot, returnable: returnable, - buffer: Vec::new(), needs_sorting_and_ags: needs_sorting_and_ags, done_with_sorting_and_ags: false, does_group_or_aggr: does_group_or_aggr, @@ -316,13 +314,13 @@ impl Query { } } + pub struct QueryResults<'a> { filter: Box, doc_result_next: DocResult, snapshot: Snapshot<'a>, iter: DBIterator, returnable: Box, - buffer: Vec, needs_sorting_and_ags: bool, done_with_sorting_and_ags: bool, does_group_or_aggr: bool, @@ -392,7 +390,7 @@ impl<'a> QueryResults<'a> { } } - pub fn next_result(&mut self) -> Result, Error> { + pub fn next_result(&mut self) -> Result, Error> { if self.needs_sorting_and_ags { loop { let next = if self.done_with_sorting_and_ags { @@ -424,11 +422,8 @@ impl<'a> QueryResults<'a> { } } } - if let Some(mut result) = self.sorted_buffer.pop() { - self.buffer.clear(); - try!(self.returnable.write_result(&mut result, &mut self.buffer)); - let str = unsafe{str::from_utf8_unchecked(&self.buffer[..])}; - return Ok(Some(str.to_string())); + if let Some(mut results) = self.sorted_buffer.pop() { + return Ok(Some(try!(self.returnable.json_result(&mut results)))); } else { return Ok(None); } @@ -444,9 +439,7 @@ impl<'a> QueryResults<'a> { let mut results = VecDeque::new(); try!(self.returnable.fetch_result(&mut self.iter, dr.seq, score, &dr.bind_name_result, &mut results)); - self.buffer.clear(); - try!(self.returnable.write_result(&mut results, &mut self.buffer)); - Ok(Some(unsafe{str::from_utf8_unchecked(&self.buffer[..])}.to_string())) + Ok(Some(try!(self.returnable.json_result(&mut results)))) } } @@ -631,7 +624,17 @@ impl<'a> QueryResults<'a> { } } +impl<'a> Iterator for QueryResults<'a> { + type Item = Result; + fn next(&mut self) -> Option> { + match self.next_result() { + Ok(Some(json)) => 
Some(Ok(json)), + Ok(None) => None, + Err(reason) => Some(Err(reason)), + } + } +} #[derive(PartialEq, Eq, Clone)] @@ -852,8 +855,7 @@ pub trait Returnable { fn take_sort_for_matching_fields(&mut self, map: &mut HashMap); - fn write_result(&self, results: &mut VecDeque, - write: &mut Write) -> Result<(), Error>; + fn json_result(&self, results: &mut VecDeque) -> Result; } pub struct RetObject { @@ -888,25 +890,12 @@ impl Returnable for RetObject { } } - fn write_result(&self, results: &mut VecDeque, - write: &mut Write) -> Result<(), Error> { - try!(write.write_all("{".as_bytes())); - let mut iter = self.fields.iter().peekable(); - loop { - match iter.next() { - Some(&(ref key, ref returnable)) => { - try!(write.write_all(JsonValue::str_to_literal(key).as_bytes())); - try!(write.write_all(":".as_bytes())); - try!(returnable.write_result(results, write)); - }, - None => break, - } - if iter.peek().is_some() { - try!(write.write_all(",".as_bytes())); - } + fn json_result(&self, results: &mut VecDeque) -> Result { + let mut vec = Vec::with_capacity(self.fields.len()); + for &(ref key, ref returnable) in self.fields.iter() { + vec.push((key.clone(), try!(returnable.json_result(results)))); } - try!(write.write_all("}".as_bytes())); - Ok(()) + Ok(JsonValue::Object(vec)) } } @@ -943,22 +932,12 @@ impl Returnable for RetArray { } } - fn write_result(&self, results: &mut VecDeque, - write: &mut Write) -> Result<(), Error> { - - try!(write.write_all("[".as_bytes())); - let mut iter = self.slots.iter().peekable(); - loop { - match iter.next() { - Some(ref returnable) => try!(returnable.write_result(results, write)), - None => break, - } - if iter.peek().is_some() { - try!(write.write_all(",".as_bytes())); - } + fn json_result(&self, results: &mut VecDeque) -> Result { + let mut vec = Vec::with_capacity(self.slots.len()); + for slot in self.slots.iter() { + vec.push(try!(slot.json_result(results))); } - try!(write.write_all("]".as_bytes())); - Ok(()) + Ok(JsonValue::Array(vec)) } } @@ -996,13 +975,12 @@ impl Returnable for RetHidden { self.visible.take_sort_for_matching_fields(map); } - fn write_result(&self, results: &mut VecDeque, - write: &mut Write) -> Result<(), Error> { + fn json_result(&self, results: &mut VecDeque) -> Result { for _n in 0..self.unrendered.len() { // we already sorted at this point, now discard the values results.pop_front(); } - self.visible.write_result(results, write) + self.visible.json_result(results) } } @@ -1029,10 +1007,8 @@ impl Returnable for RetLiteral { //noop } - fn write_result(&self, _results: &mut VecDeque, - write: &mut Write) -> Result<(), Error> { - - self.json.render(write) + fn json_result(&self, _results: &mut VecDeque) -> Result { + Ok(self.json.clone()) } } @@ -1227,14 +1203,12 @@ impl Returnable for RetValue { } } - fn write_result(&self, results: &mut VecDeque, - write: &mut Write) -> Result<(), Error> { + fn json_result(&self, results: &mut VecDeque) -> Result { if let Some(json) = results.pop_front() { - try!(json.render(write)); + Ok(json) } else { panic!("missing result!"); } - Ok(()) } } @@ -1297,14 +1271,12 @@ impl Returnable for RetBind { } } - fn write_result(&self, results: &mut VecDeque, - write: &mut Write) -> Result<(), Error> { + fn json_result(&self, results: &mut VecDeque) -> Result { if let Some(json) = results.pop_front() { - try!(json.render(write)); + Ok(json) } else { - panic!("missing result!"); + panic!("missing bind result!"); } - Ok(()) } } @@ -1336,14 +1308,12 @@ impl Returnable for RetScore { } } - fn write_result(&self, 
results: &mut VecDeque, - write: &mut Write) -> Result<(), Error> { + fn json_result(&self, results: &mut VecDeque) -> Result { if let Some(json) = results.pop_front() { - try!(json.render(write)); + Ok(json) } else { - panic!("missing result!"); + panic!("missing score result!"); } - Ok(()) } } @@ -1355,8 +1325,6 @@ mod tests { use index::{Index, OpenOptions}; - - #[test] fn test_query_hello_world() { let dbname = "target/tests/querytestdbhelloworld"; @@ -1367,861 +1335,11 @@ mod tests { let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#); index.flush().unwrap(); - let mut query_results = Query::get_matches(r#"find {hello:=="world"}"#.to_string(), &index).unwrap(); + let mut query_results = Query::get_matches(r#"find {hello:=="world"}"#, &index).unwrap(); //let mut query_results = Query::get_matches(r#"a.b[foo="bar"]"#.to_string(), &index).unwrap(); println!("query results: {:?}", query_results.get_next_id()); } - #[test] - fn test_query_basic() { - let dbname = "target/tests/querytestdbbasic"; - let _ = Index::delete(dbname); - - let mut index = Index::new(); - index.open(dbname, Some(OpenOptions::Create)).unwrap(); - let _ = index.add(r#"{"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}"#); - let _ = index.add(r#"{"_id":"2", "A":[{"B":"B2","C":[{"D":"D"}]},{"B": "b1","C":"C2"}]}"#); - let _ = index.add(r#"{"_id":"3", "A":"Multi word sentence"}"#); - let _ = index.add(r#"{"_id":"4", "A":"%&%}{}@);€"}"#); - let _ = index.add(r#"{"_id":"5", "A":"{}€52 deeply \\n\\v "}"#); - let _ = index.add(r#"{"_id":"6", "A":[{"B":"B3"},{"B": "B3"}]}"#); - let _ = index.add(r#"{"_id":"7", "A":[{"B":"B3"},{"B": "B4"}]}"#); - let _ = index.add(r#"{"_id":"8", "A":["A1", "A1"]}"#); - let _ = index.add(r#"{"_id":"9", "A":["A1", "A2"]}"#); - let _ = index.add(r#"{"_id":"10", "A":"a bunch of words in this sentence"}"#); - let _ = index.add(r#"{"_id":"11", "A":""}"#); - let _ = index.add(r#"{"_id":"12", "A":["1","2","3","4","5","6","7","8","9","10","11","12"]}"#); - let _ = index.add(r#"{"_id":"13", "A":["foo",1,true,false,null,{},[]]}"#); - - index.flush().unwrap(); - - let mut query_results = Query::get_matches(r#"find {A:[{B: =="B2", C: [{D: =="D"} ]}]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[{B: == "B2", C: == "C2"}]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[{B: == "B2", C: == "C8"}]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[{B: == "b1", C: == "C2"}]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: == "Multi word sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: == "%&%}{}@);€"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("4".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), 
None); - - query_results = Query::get_matches(r#"find {A: == "{}€52 deeply \\n\\v "}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("5".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[{C: == "C2"}]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("1".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), Some("2".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[{B: == "B3" || B: == "B4"}]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("6".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), Some("7".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "A1" || == "A2"]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("8".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), Some("9".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "A1" && == "A" || == "A1"]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("8".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), Some("9".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[=="A" || == "A1" && == "A"]}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~= "Multi"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~= "multi word"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~= "word sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~= "sentence word"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~1= "multi sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("3".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~4= "a sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~5= "a sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~4= "a bunch of words sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~5= "a bunch of words sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); - 
assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: ~10= "a bunch of words sentence"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("10".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A: == ""}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.get_next_id().unwrap(), Some("11".to_string())); - assert_eq!(query_results.get_next_id().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "1"]} - return .A "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(), - Some(r#"["1","2","3","4","5","6","7","8","9","10","11","12"]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "2"]} - return .A[0] "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "2"]} - return [.A[0], ._id] "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["1","12"]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "2"]} - return {foo:.A[0], bar: ._id} "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"1","bar":"12"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "foo"]} - return .A "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["foo",1,true,false,null,{},[]]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - query_results = Query::get_matches(r#"find {A:[ == "foo"]} - return .B "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - query_results = Query::get_matches(r#"find {A:[ == "foo"]} - return .B default={foo:"foo"}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"foo"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - query_results = Query::get_matches(r#"find {A:[ == "foo"]} - return .B default={}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - query_results = Query::get_matches(r#"find {A:[ == "foo"]} - return {foo: .B default={bar:"bar"}}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":{"bar":"bar"}}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - query_results = Query::get_matches(r#"find {A:[ == "foo"]} - return {"a":"a","b":1.123,"true":true,"false":false,"null":null,array:[],object:{}}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(), - Some(r#"{"a":"a","b":1.123,"true":true,"false":false,"null":null,"array":[],"object":{}}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - } - - #[test] - fn test_query_group() { - let dbname = "target/tests/querytestgroup"; - let _ = Index::delete(dbname); - - let mut index = Index::new(); - index.open(dbname, 
Some(OpenOptions::Create)).unwrap(); - - - let _ = index.add(r#"{"_id":"1", "foo":"group", "baz": "a", "bar": 1}"#); - let _ = index.add(r#"{"_id":"2", "foo":"group", "baz": "b", "bar": 2}"#); - let _ = index.add(r#"{"_id":"3", "foo":"group", "baz": "c", "bar": 3}"#); - let _ = index.add(r#"{"_id":"4", "foo":"group", "baz": "a", "bar": 1}"#); - let _ = index.add(r#"{"_id":"5", "foo":"group", "baz": "b", "bar": 2}"#); - let _ = index.add(r#"{"_id":"6", "foo":"group", "baz": "c", "bar": 3}"#); - let _ = index.add(r#"{"_id":"7", "foo":"group", "baz": "a", "bar": 1}"#); - let _ = index.add(r#"{"_id":"8", "foo":"group", "baz": "b", "bar": 2}"#); - let _ = index.add(r#"{"_id":"9", "foo":"group", "baz": "c", "bar": 3}"#); - let _ = index.add(r#"{"_id":"10", "foo":"group", "baz": "a", "bar": 1}"#); - let _ = index.add(r#"{"_id":"11", "foo":"group", "baz": "b", "bar": 2}"#); - let _ = index.add(r#"{"_id":"12", "foo":"group", "baz": "c", "bar": 3}"#); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {baz: group(.baz), bar: sum(.bar)}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","bar":4}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","bar":8}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","bar":12}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {bar: sum(.bar)}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":24}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {bar: avg(.bar)}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":2}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {baz: group(.baz), concat: concat(.baz sep="|")}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","concat":"a|a|a|a"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","concat":"b|b|b|b"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","concat":"c|c|c|c"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {baz: group(.baz), list: list(.baz)}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","list":["a","a","a","a"]}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","list":["b","b","b","b"]}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","list":["c","c","c","c"]}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {baz: group(.baz), count: count()}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"a","count":4}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"b","count":4}"#.to_string())); - 
assert_eq!(query_results.next_result().unwrap(),Some(r#"{"baz":"c","count":4}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {max: max(.bar)}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"max":3}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {min: min(.bar)}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"min":1}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - - let mut query_results = Query::get_matches(r#"find {foo: =="group"} - return {max: max(.baz)}"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"max":"c"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - let _ = index.add(r#"{"_id":"1", "foo":"group2", "baz": "a", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"2", "foo":"group2", "baz": "a", "bar": "b"}"#); - let _ = index.add(r#"{"_id":"3", "foo":"group2", "baz": "b", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"4", "foo":"group2", "baz": "b", "bar": "b"}"#); - let _ = index.add(r#"{"_id":"5", "foo":"group2", "baz": "a", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"6", "foo":"group2", "baz": "a", "bar": "c"}"#); - let _ = index.add(r#"{"_id":"7", "foo":"group2", "baz": "b", "bar": "d"}"#); - let _ = index.add(r#"{"_id":"8", "foo":"group2", "baz": "b", "bar": "e"}"#); - let _ = index.add(r#"{"_id":"9", "foo":"group2", "baz": "a", "bar": "f"}"#); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {foo: =="group2"} - return [group(.baz order=asc), group(.bar order=desc), count()]"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","b",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","a",2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","e",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","d",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","b",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","a",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {foo: =="group2"} - return [group(.baz order=asc), group(.bar order=desc), count()] - limit 2"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - let _ = index.add(r#"{"_id":"1", "foo":"group3", "baz": "a", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"2", "foo":"group3", "bar": "b"}"#); - let _ = index.add(r#"{"_id":"3", "foo":"group3", "baz": "b", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"4", "foo":"group3", "baz": "b", "bar": "b"}"#); - let _ = index.add(r#"{"_id":"5", "foo":"group3", "baz": "a", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"6", "foo":"group3", "baz": "a", }"#); - let _ = 
index.add(r#"{"_id":"7", "foo":"group3", "baz": "b", "bar": "d"}"#); - let _ = index.add(r#"{"_id":"8", "foo":"group3", "baz": "b", "bar": "e"}"#); - let _ = index.add(r#"{"_id":"9", "foo":"group3", "baz": "a", "bar": "f"}"#); - - index.flush().unwrap(); - - - let mut query_results = Query::get_matches(r#"find {foo: =="group2"} - return [group(.baz order=asc) default="a", group(.bar order=desc) default="c", count()]"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","b",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","a",2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","e",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","d",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","b",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["b","a",1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - - } - - - #[test] - fn test_query_json_collation() { - let dbname = "target/tests/querytestjsoncollation"; - - let _ = Index::delete(dbname); - - let mut index = Index::new(); - index.open(dbname, Some(OpenOptions::Create)).unwrap(); - - - assert_eq!(Ok(()), index.add(r#"{"_id":"1", "foo":"coll", "bar": {}}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"2", "foo":"coll", "bar": {"foo":"bar"}}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"3", "foo":"coll", "bar": {"foo":"baz"}}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"4", "foo":"coll", "bar": {"foo":"baz","bar":"baz"}}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"5", "foo":"coll", "bar": {"foo":"baz","bar":"bar"}}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"6", "foo":"coll", "bar": 1}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"7", "foo":"coll", "bar": 1.00001}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"8", "foo":"coll", "bar": 2.00001}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"9", "foo":"coll", "bar": true}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"10", "foo":"coll", "bar": false}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"11", "foo":"coll", "bar": null}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"12", "foo":"coll", "bar": []}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"13", "foo":"coll", "bar": [true]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"14", "foo":"coll", "bar": [null]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"15", "foo":"coll", "bar": "string"}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"16", "foo":"coll", "bar": "string2"}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"17", "foo":"coll", "bar": "string3"}"#)); - - index.flush().unwrap(); - - - { - let mut query_results = Query::get_matches(r#"find {foo: =="coll"} - sort .bar asc - return .bar "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"false"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"true"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"1"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"1.00001"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"2.00001"#.to_string())); - 
assert_eq!(query_results.next_result().unwrap(),Some(r#""string""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""string2""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""string3""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[null]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[true]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":"bar","foo":"baz"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"bar":"baz","foo":"baz"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"bar"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"{"foo":"baz"}"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {foo: =="coll"} - sort .bar asc - return .bar - limit 5"#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"false"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"true"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"1"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"1.00001"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {foo: =="coll"} - sort .bar asc - return .bar - limit 1"#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"null"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - assert_eq!(Ok(()), index.add(r#"{"_id":"20", "foo":"coll2", "bar":[1,1,1]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"21", "foo":"coll2", "bar":[1,1,2]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"22", "foo":"coll2", "bar":[1,2,2]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"23", "foo":"coll2", "bar":[2,2,2]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"24", "foo":"coll2", "bar":[2,1,1]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"25", "foo":"coll2", "bar":[2,1,2]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"26", "foo":"coll2", "bar":[2,3,2]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"27", "foo":"coll2", "bar":[3,4,3]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"28", "foo":"coll2", "bar":[5,4,3]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"29", "foo":"coll2", "bar":[5,5,5]}"#)); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {foo: =="coll2"} - sort .bar[0] asc, .bar[1] desc, .bar[2] desc - return [.bar[0], .bar[1], .bar[2]] "#.to_string(), &index).unwrap(); - - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,2,2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,1,2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,1,1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,3,2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,2,2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,1,2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[2,1,1]"#.to_string())); - 
assert_eq!(query_results.next_result().unwrap(),Some(r#"[3,4,3]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[5,5,5]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"[5,4,3]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - - let _ = index.add(r#"{"_id":"1", "foo":"group2", "baz": "a", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"2", "foo":"group2", "baz": "a", "bar": "b"}"#); - let _ = index.add(r#"{"_id":"3", "foo":"group2", "baz": "b", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"4", "foo":"group2", "baz": "b", "bar": "b"}"#); - let _ = index.add(r#"{"_id":"5", "foo":"group2", "baz": "a", "bar": "a"}"#); - let _ = index.add(r#"{"_id":"6", "foo":"group2", "baz": "a", "bar": "c"}"#); - let _ = index.add(r#"{"_id":"7", "foo":"group2", "baz": "b", "bar": "d"}"#); - let _ = index.add(r#"{"_id":"8", "foo":"group2", "baz": "b", "bar": "e"}"#); - let _ = index.add(r#"{"_id":"9", "foo":"group2", "baz": "a", "bar": "f"}"#); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {foo: =="group2"} - sort .baz asc, .bar desc - return [.baz, .bar] - limit 2"#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","f"]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#"["a","c"]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - } - - - #[test] - fn test_query_bind_var() { - let dbname = "target/tests/querytestbindvar"; - - let _ = Index::delete(dbname); - - let mut index = Index::new(); - index.open(dbname, Some(OpenOptions::Create)).unwrap(); - - - assert_eq!(Ok(()), index.add(r#"{"_id":"1", "bar": [{"a":"foo","v":1},{"a":"bar","v":2}]}"#)); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}]} - return x "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[{"a":"foo","v":1}]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}]} - return x.v "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo" || a: =="bar"}]} - return x.v "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo" || a: =="baz"}]} - return x.v "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[1]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foof" || a: =="bar"}]} - return x.v "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] || bar: x::[{a: =="bar"}]} - return x.v "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[1,2]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - 
let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="bar"}]} - return [x.v, y.v] "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[[1],[2]]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] || bar: y::[{a: =="baz"}]} - return [x.v, y.v default=0] "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#"[[1],[0]]"#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: x::[{a: =="foo"}] && bar: y::[{a: =="baz"}]} - return [x.v, y.v] "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(), None); - } - } - - #[test] - fn test_query_score() { - let dbname = "target/tests/querytestscore"; - - let _ = Index::delete(dbname); - - let mut index = Index::new(); - index.open(dbname, Some(OpenOptions::Create)).unwrap(); - - assert_eq!(Ok(()), index.add(r#"{"_id":"1", "bar": "fox"}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"2", "bar": "quick fox"}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"3", "bar": "quick brown fox"}"#)); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} - sort score() desc - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""3""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""2""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - - { - let mut query_results = Query::get_matches(r#"find {bar: ~="quick brown fox"} - sort score() desc - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""3""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - - { - //score boosting - let mut query_results = Query::get_matches(r#"find {bar: ~="quick brown fox"} - return score() "#.to_string(), &index).unwrap(); - - - let mut query_results2 = Query::get_matches(r#"find {bar: ~="quick brown fox"^2} - return score() "#.to_string(), &index).unwrap(); - - - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: =="quick brown fox"} - return score() "#.to_string(), &index).unwrap(); - - - let mut query_results2 = Query::get_matches(r#"find {bar: =="quick brown fox"^2} - return score() "#.to_string(), &index).unwrap(); - - - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - { - let mut query_results = Query::get_matches(r#"find {bar: ~2="quick brown fox"} - return score() "#.to_string(), &index).unwrap(); - - - let mut query_results2 = Query::get_matches(r#"find {bar: ~2="quick brown fox"^2} - return score() "#.to_string(), &index).unwrap(); - - - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - { - let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} - sort 
score() desc - return score() "#.to_string(), &index).unwrap(); - let mut query_results2 = Query::get_matches(r#"find ({bar: ~="fox" || bar: ~="brown" || bar: ~="quick"})^2 - sort score() desc - return score() "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - let mut query_results2 = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"}^2 - sort score() desc - return score() "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - { - let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - let mut query_results2 = Query::get_matches(r#"find {bar: ~="fox"^2 || (bar: ~="brown" || bar: ~="quick")^2 } - sort score() desc - return score() "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - { - let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - let mut query_results2 = Query::get_matches(r#"find {bar: ~="fox"}^2 || {bar: ~="brown" || bar: ~="quick"}^2 - sort score() desc - return score() "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - assert_eq!(Ok(()), index.add(r#"{"_id":"4", "bar": ["fox"]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"5", "bar": ["quick fox"]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"6", "bar": ["quick brown fox"]}"#)); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {bar:[ ~="fox" || ~="brown" || ~="quick"]} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - let mut query_results2 = Query::get_matches(r#"find {bar:[~="fox" || ~="brown" || ~="quick"]^2} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - { - let mut query_results = Query::get_matches(r#"find {bar:[ ~="fox" || ~="brown" || ~="quick"]} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - let mut query_results2 = Query::get_matches(r#"find {bar:[~="fox"]^2 || bar:[~="brown" || ~="quick"]^2} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - { - let mut query_results = Query::get_matches(r#"find {bar:[ ~="fox" || ~="brown" || ~="quick"]} - sort score() desc - return score() "#.to_string(), &index).unwrap(); - let mut query_results2 = Query::get_matches(r#"find {bar:[~="fox"]^2 || (bar:[~="brown"] || bar:[~="quick"])^2} - sort score() desc - return score() "#.to_string(), 
&index).unwrap(); - assert_eq!(query_results.next_result().unwrap().unwrap().parse::().unwrap()*2.0, - query_results2.next_result().unwrap().unwrap().parse::().unwrap()); - } - - } - - #[test] - fn test_query_not() { - let dbname = "target/tests/querytestnot"; - - let _ = Index::delete(dbname); - - let mut index = Index::new(); - index.open(dbname, Some(OpenOptions::Create)).unwrap(); - - assert_eq!(Ok(()), index.add(r#"{"_id":"1", "bar": "fox"}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"2", "bar": "quick fox"}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"3", "bar": "quick brown fox"}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"4", "bar": ["fox"]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"5", "bar": ["quick fox"]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"6", "bar": ["quick brown fox"]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"7", "baz": ["fox"]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"8", "baz": ["quick","fox"]}"#)); - assert_eq!(Ok(()), index.add(r#"{"_id":"9", "baz": ["quick","brown","fox"]}"#)); - - index.flush().unwrap(); - - { - let mut query_results = Query::get_matches(r#"find {(bar: ~="fox" || bar: ~="brown") && (bar: !~="quick")} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {(bar: ~="fox" || bar: ~="brown") && !(bar: ~="quick")} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {bar: ~="fox" || bar: ~="brown"} && !{bar: ~="quick"} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""1""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: [(~="fox" || ~="brown") && !~="quick"]} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""4""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {bar: [(~="fox" || ~="brown") && !(~="quick")]} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""4""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {bar: [~="fox" || ~="brown"] && bar: ![~="quick"]} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""4""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {baz: [(~="fox" || ~="brown") && !~="quick"]} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""7""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""8""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""9""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let mut query_results = Query::get_matches(r#"find {baz: [(~="fox" || ~="brown") && !(~="quick")]} - return ._id "#.to_string(), &index).unwrap(); - - 
assert_eq!(query_results.next_result().unwrap(),Some(r#""7""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""8""#.to_string())); - assert_eq!(query_results.next_result().unwrap(),Some(r#""9""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - - { - let mut query_results = Query::get_matches(r#"find {baz: [~="fox" || ~="brown"] && baz: ![~="quick"]} - return ._id "#.to_string(), &index).unwrap(); - - assert_eq!(query_results.next_result().unwrap(),Some(r#""7""#.to_string())); - assert_eq!(query_results.next_result().unwrap(), None); - } - - { - let result = Query::get_matches(r#"find !{baz: [~="fox"]} - return ._id "#.to_string(), &index); - match result { - Ok(_foo) => panic!("Didn't detect all logical nots."), - Err(_foo) => (), - } - } - - - { - let result = Query::get_matches(r#"find !{baz: ~="fox"} && !{baz: =="foo"} - return ._id "#.to_string(), &index); - match result { - Ok(_foo) => panic!("Didn't detect all logical nots."), - Err(_foo) => (), - } - } - - - { - let result = Query::get_matches(r#"find {foo: =="bar"} && !{baz: !~="fox"}} - return ._id "#.to_string(), &index); - match result { - Ok(_foo) => panic!("Didn't detect nested logical nots."), - Err(_foo) => (), - } - } - - } - #[test] fn test_query_more_docs() { let dbname = "target/tests/querytestdbmoredocs"; @@ -2236,7 +1354,7 @@ mod tests { } index.flush().unwrap(); - let mut query_results = Query::get_matches(r#"find {data: == "u"}"#.to_string(), &index).unwrap(); + let mut query_results = Query::get_matches(r#"find {data: == "u"}"#, &index).unwrap(); loop { match query_results.get_next_id() { Ok(Some(result)) => println!("result: {}", result), diff --git a/src/repl.rs b/src/repl.rs new file mode 100644 index 0000000..8c092ad --- /dev/null +++ b/src/repl.rs @@ -0,0 +1,130 @@ +use index::{Index, OpenOptions}; +use query::Query; +use json_value::{JsonValue, PrettyPrint}; + +use std::io::{Write, BufRead}; + + +fn is_command(str: &str) -> bool { + let commands = ["find", "add", "create", "drop", "open", "pretty"]; + for command in commands.iter() { + if str.starts_with(command) { + return true; + } + } + false +} + +pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { + let mut index = Index::new(); + let mut lines = String::new(); + let mut pretty = PrettyPrint::new("", "", ""); + loop { + // read in command until we get to a end semi-colon + if r.read_line(&mut lines).unwrap() > 0 { + if test_mode && lines == "\n" || lines.starts_with("#") { + // we preserve blank lines and comments in test mode + w.write_all(lines.as_bytes()).unwrap(); + lines.clear(); + continue; + } + if test_mode && !is_command(&lines) { + // we drop non-command lines + lines.clear(); + continue; + } else if !is_command(&lines) { + w.write_all(b"Unrecognized command!\n").unwrap(); + lines.clear(); + continue; + } + // check for end semi-colon + if !lines.trim_right().ends_with(";") { + while r.read_line(&mut lines).unwrap() > 0 { + // loop until we get the end semi-colon + if lines.trim_right().ends_with(";") { + break; + } + } + } + } else { + return; + } + if test_mode { + // echo the command + w.write_all(lines.as_bytes()).unwrap(); + } + lines = lines.trim_right().to_string(); + if lines.ends_with(";") { + // strip the semi-colon off + lines.pop(); + } else { + write!(w, "Unterminated command, no semi-colon (;) {}\n", lines).unwrap(); + } + + if lines.starts_with("pretty") { + if lines[6..].trim_left().starts_with("on") { + pretty = PrettyPrint::new(" ", "\n", " "); + } else { + 
pretty = PrettyPrint::new("", "", ""); + } + } else if lines.starts_with("create") { + let dbname = lines[6..].trim_left(); + match index.open(dbname, Some(OpenOptions::Create)) { + Ok(()) => (), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("drop") { + let dbname = lines[4..].trim_left(); + match Index::delete(dbname) { + Ok(()) => (), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("open") { + let dbname = lines[4..].trim_left(); + match index.open(dbname, None) { + Ok(()) => (), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("add") { + match index.add(&lines[3..]) { + Ok(id) => write!(w, "{}\n", JsonValue::str_to_literal(&id)).unwrap(), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("find") { + if let Err(reason) = index.flush() { + write!(w, "{}\n", reason).unwrap(); + } else { + match Query::get_matches(&lines, &index) { + Ok(results) => { + let mut results = results.peekable(); + + w.write_all(b"[").unwrap(); + if results.peek().is_some() { + w.write_all(b"\n").unwrap(); + } + pretty.push(); + while let Some(result) = results.next() { + match result { + Ok(json) => { + json.render(w, &mut pretty).unwrap(); + if results.peek().is_some() { + w.write_all(b",").unwrap(); + } + w.write_all(b"\n").unwrap(); + }, + Err(reason) => { + write!(w, "{}\n", reason).unwrap(); + }, + } + } + w.write_all(b"]\n").unwrap(); + }, + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } + } + lines.clear(); + } +} + + diff --git a/tests/repl_tests.rs b/tests/repl_tests.rs new file mode 100644 index 0000000..bd85d7e --- /dev/null +++ b/tests/repl_tests.rs @@ -0,0 +1,57 @@ +extern crate noise; + +use std::io::{Read, Write, BufReader}; +use std::fs::{self, File}; +use std::env; + +use noise::repl::repl; + +#[test] +fn test_repl() { + // We load up tests scripts from repl-tests and evaluate them. The output should be idenitical + // to the test script files. If not, then the test is failed and a new file is written with + // .reject extension in the same directory where it can be investigated. + + // To update the test files with new command and output, simply edit/add commands and run + // update-test-repl.sh script from the project root directory. Then examin or do a git diff to see + // if the output is as expected. + + let mut test_dir = env::current_dir().unwrap(); + test_dir.push("repl-tests"); + let mut failures = 0; + let mut total = 0; + for entry in fs::read_dir(test_dir).unwrap() { + let mut path = entry.unwrap().path(); + if path.extension().unwrap().to_str().unwrap() != "noise" { + continue; + } + total += 1; + let mut file = File::open(path.clone()).unwrap(); + let mut file_buffer = Vec::new(); + file.read_to_end(&mut file_buffer).unwrap(); + + let mut test_result_buffer = Vec::new(); + let file = File::open(path.clone()).unwrap(); + + repl(&mut BufReader::new(file), &mut test_result_buffer, true); + + if file_buffer != test_result_buffer { + failures += 1; + let test_name = path.file_name().unwrap().to_str().unwrap().to_string(); + path.set_extension("reject"); + let reject = path.file_name().unwrap().to_str().unwrap().to_string(); + + let mut file = File::create(path.clone()).unwrap(); + file.write_all(&test_result_buffer).unwrap(); + file.sync_all().unwrap(); + + println!("Repl test {} failure. 
+
+    let mut test_dir = env::current_dir().unwrap();
+    test_dir.push("repl-tests");
+    let mut failures = 0;
+    let mut total = 0;
+    for entry in fs::read_dir(test_dir).unwrap() {
+        let mut path = entry.unwrap().path();
+        if path.extension().unwrap().to_str().unwrap() != "noise" {
+            continue;
+        }
+        total += 1;
+        let mut file = File::open(path.clone()).unwrap();
+        let mut file_buffer = Vec::new();
+        file.read_to_end(&mut file_buffer).unwrap();
+
+        let mut test_result_buffer = Vec::new();
+        let file = File::open(path.clone()).unwrap();
+
+        repl(&mut BufReader::new(file), &mut test_result_buffer, true);
+
+        if file_buffer != test_result_buffer {
+            failures += 1;
+            let test_name = path.file_name().unwrap().to_str().unwrap().to_string();
+            path.set_extension("reject");
+            let reject = path.file_name().unwrap().to_str().unwrap().to_string();
+
+            let mut file = File::create(path.clone()).unwrap();
+            file.write_all(&test_result_buffer).unwrap();
+            file.sync_all().unwrap();
+
+            println!("Repl test {} failure. Failing output written to {} in repl-tests dir.",
+                     test_name, reject);
+        } else {
+            println!("{} successful", path.file_name().unwrap().to_str().unwrap().to_string());
+        }
+    }
+    if failures > 0 {
+        panic!("Failed {} tests in repl-tests out of {}", failures, total);
+    }
+}
\ No newline at end of file
diff --git a/update-test-repl.sh b/update-test-repl.sh
new file mode 100755
index 0000000..f127601
--- /dev/null
+++ b/update-test-repl.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# This script runs the repl tests and updates the expected output files in place
+
+SCRIPTPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+DIRNAME="$(dirname ${SCRIPTPATH})"
+
+NOISE="${DIRNAME}/target/debug/noise"
+REPL_TEST_DIR="${DIRNAME}/repl-tests"
+
+if [[ ! -f "${NOISE}" ]]; then
+    echo "Can't find noise binary, looked at ${NOISE}"
+    exit 1
+fi
+
+REPL_TESTS="${REPL_TEST_DIR}/*.noise"
+for f in $REPL_TESTS
+do
+    echo -n "Testing: ${f}..."
+    RUST_BACKTRACE=1 "${NOISE}" -t < "${f}" > "${f}.out"
+    echo "updating."
+    cp "${f}.out" "${f}"
+    rm "${f}.out"
+done
+
+echo "Updated tests. Use \`\`git diff ./repl-tests\`\` to review the changes."

From 05c7bf4d5e64aa81053e656d078e39d884cbafe3 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Sun, 15 Jan 2017 21:35:40 -0800
Subject: [PATCH 061/122] Move Returnables into their own files and added comments

---
 src/filters.rs    |   3 +-
 src/lib.rs        |   1 +
 src/parser.rs     |   4 +-
 src/query.rs      | 480 +-------------------------------------------
 src/returnable.rs | 513 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 522 insertions(+), 479 deletions(-)
 create mode 100644 src/returnable.rs

diff --git a/src/filters.rs b/src/filters.rs
index 728fadf..4f377c9 100644
--- a/src/filters.rs
+++ b/src/filters.rs
@@ -10,7 +10,8 @@ use std::io::Cursor;
 
 use error::Error;
 use key_builder::KeyBuilder;
-use query::{DocResult, QueryScoringInfo, RetValue};
+use query::{DocResult, QueryScoringInfo};
+use returnable::RetValue;
 use json_value::JsonValue;
 
 // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs
diff --git a/src/lib.rs b/src/lib.rs
index 97aaf60..a6709d0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -6,6 +6,7 @@ mod json_shred;
 mod key_builder;
 mod parser;
 mod stems;
+mod returnable;
 pub mod repl;
 pub mod json_value;
 pub mod index;
diff --git a/src/parser.rs b/src/parser.rs
index b5d1394..932a71f 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -9,8 +9,8 @@ use error::Error;
 use key_builder::KeyBuilder;
 use stems::Stems;
 use json_value::JsonValue;
-use query::{Sort, Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore,
-            AggregateFun, SortInfo, SortField};
+use query::{Sort, AggregateFun, SortInfo, SortField};
+use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore};
 use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter,
               StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter,
               NotFilter};
diff --git a/src/query.rs b/src/query.rs
index ada6d5f..e0c5d70 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -2,21 +2,21 @@ use std::str;
 use std::cmp::Ordering;
 use std::collections::HashMap;
-use std::iter::Peekable;
-use std::mem::{transmute, swap};
+use std::mem::swap;
 use std::collections::VecDeque;
 use std::iter::Iterator;
 use std::usize;
 
 use error::Error;
 use index::Index;
-use key_builder::{KeyBuilder, Segment};
+use key_builder::KeyBuilder;
 use parser::Parser;
 use json_value::{JsonValue};
 use filters::QueryRuntimeFilter;
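+// The Returnable trait and all of its implementations (RetValue, RetObject,
+// RetArray, RetLiteral, RetBind, RetScore, RetHidden) now live in the new
+// src/returnable.rs; this file keeps only the imports it still needs to
+// assemble results.
+use 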
returnable::{Returnable, RetValue, RetScore, RetHidden}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{self, DBIterator, IteratorMode, Snapshot}; +use rocksdb::{DBIterator, IteratorMode, Snapshot}; #[derive(Clone)] @@ -844,478 +844,6 @@ pub struct SortInfo { } -pub trait Returnable { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, - bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error>; - - fn get_aggregate_funs(&self, funs: &mut Vec>); - - fn get_sorting(&self, sorts: &mut Vec>); - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap); - - fn json_result(&self, results: &mut VecDeque) -> Result; -} - -pub struct RetObject { - pub fields: Vec<(String, Box)>, -} - -impl Returnable for RetObject { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, - bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { - for &(ref _key, ref field) in self.fields.iter() { - try!(field.fetch_result(iter, seq, score, bind_var_keys, result)); - } - Ok(()) - } - - fn get_aggregate_funs(&self, funs: &mut Vec>) { - for &(ref _key, ref field) in self.fields.iter() { - field.get_aggregate_funs(funs); - } - } - - fn get_sorting(&self, sorts: &mut Vec>) { - for &(ref _key, ref field) in self.fields.iter() { - field.get_sorting(sorts); - } - } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - for &mut (ref _key, ref mut field) in self.fields.iter_mut() { - field.take_sort_for_matching_fields(map); - } - } - - fn json_result(&self, results: &mut VecDeque) -> Result { - let mut vec = Vec::with_capacity(self.fields.len()); - for &(ref key, ref returnable) in self.fields.iter() { - vec.push((key.clone(), try!(returnable.json_result(results)))); - } - Ok(JsonValue::Object(vec)) - } -} - - -pub struct RetArray { - pub slots: Vec>, -} - -impl Returnable for RetArray { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, - bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { - for ref slot in self.slots.iter() { - try!(slot.fetch_result(iter, seq, score, bind_var_keys, result)); - } - Ok(()) - } - - fn get_aggregate_funs(&self, funs: &mut Vec>) { - for ref slot in self.slots.iter() { - slot.get_aggregate_funs(funs); - } - } - - fn get_sorting(&self, sorts: &mut Vec>) { - for ref slot in self.slots.iter() { - slot.get_sorting(sorts); - } - } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - for slot in self.slots.iter_mut() { - slot.take_sort_for_matching_fields(map); - } - } - - fn json_result(&self, results: &mut VecDeque) -> Result { - let mut vec = Vec::with_capacity(self.slots.len()); - for slot in self.slots.iter() { - vec.push(try!(slot.json_result(results))); - } - Ok(JsonValue::Array(vec)) - } -} - - - -pub struct RetHidden { - unrendered: Vec>, - visible: Box, -} - -impl Returnable for RetHidden { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, - bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { - for ref mut unrendered in self.unrendered.iter() { - try!(unrendered.fetch_result(iter, seq, score, bind_var_keys, result)); - } - - self.visible.fetch_result(iter, seq, score, bind_var_keys, result) - } - - fn get_aggregate_funs(&self, funs: &mut Vec>) { - self.visible.get_aggregate_funs(funs); - } - - fn get_sorting(&self, sorts: &mut Vec>) { - for ref mut unrendered in self.unrendered.iter() { - unrendered.get_sorting(sorts); - } - - 
self.visible.get_sorting(sorts); - } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.visible.take_sort_for_matching_fields(map); - } - - fn json_result(&self, results: &mut VecDeque) -> Result { - for _n in 0..self.unrendered.len() { - // we already sorted at this point, now discard the values - results.pop_front(); - } - self.visible.json_result(results) - } -} - -pub struct RetLiteral { - pub json: JsonValue, -} - -impl Returnable for RetLiteral { - fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, _score: f32, - _bind_var_keys: &HashMap>, - _result: &mut VecDeque) -> Result<(), Error> { - Ok(()) - } - - fn get_aggregate_funs(&self, _funs: &mut Vec>) { - //noop - } - - fn get_sorting(&self, _sorts: &mut Vec>) { - //noop - } - - fn take_sort_for_matching_fields(&mut self, _map: &mut HashMap) { - //noop - } - - fn json_result(&self, _results: &mut VecDeque) -> Result { - Ok(self.json.clone()) - } -} - -pub struct RetValue { - pub kb: KeyBuilder, - pub ag: Option<(AggregateFun, JsonValue)>, - pub default: JsonValue, - pub sort: Option, -} - -impl RetValue { - pub fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { - match bytes[0] as char { - 's' => { - let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); - JsonValue::String(string) - }, - 'f' => { - assert!(bytes.len() == 9); - let mut bytes2: [u8; 8] = [0; 8]; - for (n, b) in bytes[1..9].iter().enumerate() { - bytes2[n] = *b; - } - let double: f64 = unsafe{transmute(bytes2)}; - JsonValue::Number(double) - }, - 'T' => JsonValue::True, - 'F' => JsonValue::False, - 'N' => JsonValue::Null, - 'o' => JsonValue::Object(vec![]), - 'a' => JsonValue::Array(vec![]), - what => panic!("unexpected type tag in value: {}", what), - } - } - - fn return_array(mut array: Vec<(u64, JsonValue)>) -> Result { - array.sort_by_key(|tuple| tuple.0); - Ok(JsonValue::Array(array.into_iter() - .map(|(_i, json)| json) - .collect())) - } - - fn fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, - mut key: Box<[u8]>, mut value: Box<[u8]>) -> Result { - - if key.len() == value_key.len() { - // we have a key match! 
- return Ok(RetValue::bytes_to_json_value(value.as_ref())); - } - let segment = { - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - let remaining = &key_str[value_key.len()..]; - KeyBuilder::parse_first_key_value_segment(&remaining) - }; - - match segment { - Some((Segment::ObjectKey(mut unescaped), escaped)) => { - let mut object: Vec<(String, JsonValue)> = Vec::new(); - - let mut value_key_next = value_key.to_string() + &escaped; - loop { - let json_val = try!(RetValue::fetch(iter, &value_key_next, key, value)); - object.push((unescaped, json_val)); - - let segment = match iter.peek() { - Some(&(ref k, ref _v)) => { - if !k.starts_with(value_key.as_bytes()) { - return Ok(JsonValue::Object(object)); - } - - let key_str = unsafe{str::from_utf8_unchecked(&k)}; - let remaining = &key_str[value_key.len()..]; - - KeyBuilder::parse_first_key_value_segment(&remaining) - }, - None => return Ok(JsonValue::Object(object)), - }; - - if let Some((Segment::ObjectKey(unescaped2), escaped2)) = segment { - unescaped = unescaped2; - // advance the peeked iter - match iter.next() { - Some((k, v)) => { - key = k; - value = v; - } - None => panic!("couldn't advanced already peeked iter"), - }; - value_key_next.truncate(value_key.len()); - value_key_next.push_str(&escaped2); - } else { - return Ok(JsonValue::Object(object)); - } - } - } - Some((Segment::Array(mut i), escaped)) => { - // we use a tuple with ordinal because we encounter - // elements in lexical sorting order instead of ordinal order - let mut array: Vec<(u64, JsonValue)> = Vec::new(); - - let mut value_key_next = value_key.to_string() + &escaped; - loop { - let json_val = try!(RetValue::fetch(iter, &value_key_next, - key, value)); - array.push((i, json_val)); - - let segment = match iter.peek() { - Some(&(ref k, ref _v)) => { - if !k.starts_with(value_key.as_bytes()) { - return RetValue::return_array(array); - } - - let key_str = unsafe{str::from_utf8_unchecked(&k)}; - let remaining = &key_str[value_key.len()..]; - - KeyBuilder::parse_first_key_value_segment(&remaining) - }, - None => return RetValue::return_array(array), - }; - - if let Some((Segment::Array(i2), escaped2)) = segment { - i = i2; - // advance the already peeked iter - match iter.next() { - Some((k, v)) => { - key = k; - value = v; - }, - None => panic!("couldn't advanced already peeked iter"), - }; - value_key_next.truncate(value_key.len()); - value_key_next.push_str(&escaped2); - } else { - return RetValue::return_array(array); - } - } - }, - None => { - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - panic!("somehow couldn't parse key segment {} {}", value_key, key_str); - }, - } - } -} - -impl Returnable for RetValue { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, - _bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { - if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { - //don't fetch anything for count(). 
just stick in a null - result.push_back(JsonValue::Null); - return Ok(()); - } - - let value_key = self.kb.value_key(seq); - - // Seek in index to >= entry - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - - let (key, value) = match iter.next() { - Some((key, value)) => (key, value), - None => { - result.push_back(self.default.clone()); - return Ok(()) - }, - }; - - if !key.starts_with(value_key.as_bytes()) { - result.push_back(self.default.clone()); - return Ok(()); - } - - let json_value = try!(RetValue::fetch(&mut iter.peekable(), &value_key, - key, value)); - result.push_back(json_value); - Ok(()) - } - - fn get_aggregate_funs(&self, funs: &mut Vec>) { - funs.push(self.ag.clone()); - } - - fn get_sorting(&self, sorts: &mut Vec>) { - sorts.push(self.sort.clone()); - } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - if let Some(sort_info) = map.remove(&self.kb.value_key(0)) { - self.sort = Some(sort_info.sort); - } - } - - fn json_result(&self, results: &mut VecDeque) -> Result { - if let Some(json) = results.pop_front() { - Ok(json) - } else { - panic!("missing result!"); - } - } -} - -pub struct RetBind { - pub bind_name: String, - pub extra_key: String, - pub ag: Option<(AggregateFun, JsonValue)>, - pub default: JsonValue, - pub sort: Option, -} - - -impl Returnable for RetBind { - fn fetch_result(&self, iter: &mut DBIterator, _seq: u64, _score: f32, - bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { - - if let Some(value_keys) = bind_var_keys.get(&self.bind_name) { - let mut array = Vec::with_capacity(value_keys.len()); - for base_key in value_keys { - // Seek in index to >= entry - let value_key = base_key.to_string() + &self.extra_key; - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - - let (key, value) = match iter.next() { - Some((key, value)) => (key, value), - None => { - result.push_back(self.default.clone()); - return Ok(()) - }, - }; - - if !key.starts_with(value_key.as_bytes()) { - array.push(self.default.clone()); - } else { - array.push(try!(RetValue::fetch(&mut iter.peekable(), &value_key, - key, value))); - } - } - result.push_back(JsonValue::Array(array)); - } else { - result.push_back(JsonValue::Array(vec![self.default.clone()])) - } - - Ok(()) - } - - fn get_aggregate_funs(&self, funs: &mut Vec>) { - funs.push(self.ag.clone()); - } - - fn get_sorting(&self, sorts: &mut Vec>) { - sorts.push(self.sort.clone()); - } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - if let Some(sort_info) = map.remove(&(self.bind_name.to_string() + &self.extra_key)) { - self.sort = Some(sort_info.sort); - } - } - - fn json_result(&self, results: &mut VecDeque) -> Result { - if let Some(json) = results.pop_front() { - Ok(json) - } else { - panic!("missing bind result!"); - } - } -} - - -pub struct RetScore { - pub sort: Option, -} - - -impl Returnable for RetScore { - fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, score: f32, - _bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { - result.push_back(JsonValue::Number(score as f64)); - Ok(()) - } - - fn get_aggregate_funs(&self, _funs: &mut Vec>) { - // noop - } - - fn get_sorting(&self, sorts: &mut Vec>) { - sorts.push(self.sort.clone()); - } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - if let Some(sort_info) = map.remove("score()") { - self.sort = Some(sort_info.sort); - } - } - - fn json_result(&self, results: &mut VecDeque) -> 
Result { - if let Some(json) = results.pop_front() { - Ok(json) - } else { - panic!("missing score result!"); - } - } -} #[cfg(test)] mod tests { diff --git a/src/returnable.rs b/src/returnable.rs new file mode 100644 index 0000000..02ffe16 --- /dev/null +++ b/src/returnable.rs @@ -0,0 +1,513 @@ + +use std::str; +use std::collections::HashMap; +use std::iter::Peekable; +use std::mem::transmute; +use std::collections::VecDeque; +use std::iter::Iterator; + +use error::Error; +use key_builder::{KeyBuilder, Segment}; +use json_value::{JsonValue}; +use query::{Sort, AggregateFun, SortInfo}; + +use rocksdb::{self, DBIterator, IteratorMode}; + + +/// Returnables are created from parsing the return statement in queries. +/// They nest inside of each other, with the outermost typically being a RetObject or RetArray. +pub trait Returnable { + /// When a match is found, information about the match is passed to outer most Returnable + /// and then each nested Returnable will fetch information about the document (fields or + /// scores or bind variables etc) and convert them to JsonValues and add them to the result + /// VecDeque. + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error>; + + /// If aggregates are used each Returnable needs to return information about the + /// aggregate function it's using and the default value. + fn get_aggregate_funs(&self, funs: &mut Vec>); + + /// If a query has a sort clause then we want to match the fields being sorted with + /// fields being returned. We pass the sorting info by the path of the sorted fields + /// or scores and Returnables that have the same path will take the sort + /// information. Any fields not matching a returnable are then added to special hidden + /// Returnable (RetHidden) which fetches those fields for sorting but not rendered or + /// returned. + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap); + + /// Each Returnable will return the sorting direction in the same slot as the returnable + /// so that later after fetching they will be sorted by QueryResults after fetching but + /// converting to the final json result. + fn get_sorting(&self, sorts: &mut Vec>); + + /// This is the final step of a Returnable. The previous fetched JsonValues are now + /// rendered with other ornamental json elements. + fn json_result(&self, results: &mut VecDeque) -> Result; +} + +/// A static Json Object the can contain another number of fields and nested returnables. 
+pub struct RetObject { + pub fields: Vec<(String, Box)>, +} + +impl Returnable for RetObject { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error> { + for &(ref _key, ref field) in self.fields.iter() { + try!(field.fetch_result(iter, seq, score, bind_var_keys, result)); + } + Ok(()) + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + for &(ref _key, ref field) in self.fields.iter() { + field.get_aggregate_funs(funs); + } + } + + fn get_sorting(&self, sorts: &mut Vec>) { + for &(ref _key, ref field) in self.fields.iter() { + field.get_sorting(sorts); + } + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + for &mut (ref _key, ref mut field) in self.fields.iter_mut() { + field.take_sort_for_matching_fields(map); + } + } + + fn json_result(&self, results: &mut VecDeque) -> Result { + let mut vec = Vec::with_capacity(self.fields.len()); + for &(ref key, ref returnable) in self.fields.iter() { + vec.push((key.clone(), try!(returnable.json_result(results)))); + } + Ok(JsonValue::Object(vec)) + } +} + +/// A static Json array the can contain another number of nested Returnables. +pub struct RetArray { + pub slots: Vec>, +} + +impl Returnable for RetArray { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error> { + for ref slot in self.slots.iter() { + try!(slot.fetch_result(iter, seq, score, bind_var_keys, result)); + } + Ok(()) + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + for ref slot in self.slots.iter() { + slot.get_aggregate_funs(funs); + } + } + + fn get_sorting(&self, sorts: &mut Vec>) { + for ref slot in self.slots.iter() { + slot.get_sorting(sorts); + } + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + for slot in self.slots.iter_mut() { + slot.take_sort_for_matching_fields(map); + } + } + + fn json_result(&self, results: &mut VecDeque) -> Result { + let mut vec = Vec::with_capacity(self.slots.len()); + for slot in self.slots.iter() { + vec.push(try!(slot.json_result(results))); + } + Ok(JsonValue::Array(vec)) + } +} + +/// A special returnable that only fetches values for later sorting but never renders +/// them back to the caller. +pub struct RetHidden { + pub unrendered: Vec>, + pub visible: Box, +} + +impl Returnable for RetHidden { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error> { + for ref mut unrendered in self.unrendered.iter() { + try!(unrendered.fetch_result(iter, seq, score, bind_var_keys, result)); + } + + self.visible.fetch_result(iter, seq, score, bind_var_keys, result) + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + self.visible.get_aggregate_funs(funs); + } + + fn get_sorting(&self, sorts: &mut Vec>) { + for ref mut unrendered in self.unrendered.iter() { + unrendered.get_sorting(sorts); + } + + self.visible.get_sorting(sorts); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + self.visible.take_sort_for_matching_fields(map); + } + + fn json_result(&self, results: &mut VecDeque) -> Result { + for _n in 0..self.unrendered.len() { + // we already sorted at this point, now discard the values + results.pop_front(); + } + self.visible.json_result(results) + } +} + +/// A literal JsonValue. Number, String, Null, True or False. 
Just in case the query +/// wants to return something that doesn't come fro ma document. +pub struct RetLiteral { + pub json: JsonValue, +} + +impl Returnable for RetLiteral { + fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, _score: f32, + _bind_var_keys: &HashMap>, + _result: &mut VecDeque) -> Result<(), Error> { + Ok(()) + } + + fn get_aggregate_funs(&self, _funs: &mut Vec>) { + //noop + } + + fn get_sorting(&self, _sorts: &mut Vec>) { + //noop + } + + fn take_sort_for_matching_fields(&mut self, _map: &mut HashMap) { + //noop + } + + fn json_result(&self, _results: &mut VecDeque) -> Result { + Ok(self.json.clone()) + } +} + +/// A value from a document. It knows the path it wants to fetch and loads the value from the +/// stored original document. +pub struct RetValue { + pub kb: KeyBuilder, + pub ag: Option<(AggregateFun, JsonValue)>, + pub default: JsonValue, + pub sort: Option, +} + +impl RetValue { + pub fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { + match bytes[0] as char { + 's' => { + let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); + JsonValue::String(string) + }, + 'f' => { + assert!(bytes.len() == 9); + let mut bytes2: [u8; 8] = [0; 8]; + for (n, b) in bytes[1..9].iter().enumerate() { + bytes2[n] = *b; + } + let double: f64 = unsafe{transmute(bytes2)}; + JsonValue::Number(double) + }, + 'T' => JsonValue::True, + 'F' => JsonValue::False, + 'N' => JsonValue::Null, + 'o' => JsonValue::Object(vec![]), + 'a' => JsonValue::Array(vec![]), + what => panic!("unexpected type tag in value: {}", what), + } + } + + fn return_array(mut array: Vec<(u64, JsonValue)>) -> Result { + array.sort_by_key(|tuple| tuple.0); + Ok(JsonValue::Array(array.into_iter() + .map(|(_i, json)| json) + .collect())) + } + + fn fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, + mut key: Box<[u8]>, mut value: Box<[u8]>) -> Result { + + if key.len() == value_key.len() { + // we have a key match! 
+ return Ok(RetValue::bytes_to_json_value(value.as_ref())); + } + let segment = { + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + let remaining = &key_str[value_key.len()..]; + KeyBuilder::parse_first_key_value_segment(&remaining) + }; + + match segment { + Some((Segment::ObjectKey(mut unescaped), escaped)) => { + let mut object: Vec<(String, JsonValue)> = Vec::new(); + + let mut value_key_next = value_key.to_string() + &escaped; + loop { + let json_val = try!(RetValue::fetch(iter, &value_key_next, key, value)); + object.push((unescaped, json_val)); + + let segment = match iter.peek() { + Some(&(ref k, ref _v)) => { + if !k.starts_with(value_key.as_bytes()) { + return Ok(JsonValue::Object(object)); + } + + let key_str = unsafe{str::from_utf8_unchecked(&k)}; + let remaining = &key_str[value_key.len()..]; + + KeyBuilder::parse_first_key_value_segment(&remaining) + }, + None => return Ok(JsonValue::Object(object)), + }; + + if let Some((Segment::ObjectKey(unescaped2), escaped2)) = segment { + unescaped = unescaped2; + // advance the peeked iter + match iter.next() { + Some((k, v)) => { + key = k; + value = v; + } + None => panic!("couldn't advanced already peeked iter"), + }; + value_key_next.truncate(value_key.len()); + value_key_next.push_str(&escaped2); + } else { + return Ok(JsonValue::Object(object)); + } + } + } + Some((Segment::Array(mut i), escaped)) => { + // we use a tuple with ordinal because we encounter + // elements in lexical sorting order instead of ordinal order + let mut array: Vec<(u64, JsonValue)> = Vec::new(); + + let mut value_key_next = value_key.to_string() + &escaped; + loop { + let json_val = try!(RetValue::fetch(iter, &value_key_next, + key, value)); + array.push((i, json_val)); + + let segment = match iter.peek() { + Some(&(ref k, ref _v)) => { + if !k.starts_with(value_key.as_bytes()) { + return RetValue::return_array(array); + } + + let key_str = unsafe{str::from_utf8_unchecked(&k)}; + let remaining = &key_str[value_key.len()..]; + + KeyBuilder::parse_first_key_value_segment(&remaining) + }, + None => return RetValue::return_array(array), + }; + + if let Some((Segment::Array(i2), escaped2)) = segment { + i = i2; + // advance the already peeked iter + match iter.next() { + Some((k, v)) => { + key = k; + value = v; + }, + None => panic!("couldn't advanced already peeked iter"), + }; + value_key_next.truncate(value_key.len()); + value_key_next.push_str(&escaped2); + } else { + return RetValue::return_array(array); + } + } + }, + None => { + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + panic!("somehow couldn't parse key segment {} {}", value_key, key_str); + }, + } + } +} + +impl Returnable for RetValue { + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, + _bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error> { + if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { + //don't fetch anything for count(). 
just stick in a null + result.push_back(JsonValue::Null); + return Ok(()); + } + + let value_key = self.kb.value_key(seq); + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => { + result.push_back(self.default.clone()); + return Ok(()) + }, + }; + + if !key.starts_with(value_key.as_bytes()) { + result.push_back(self.default.clone()); + return Ok(()); + } + + let json_value = try!(RetValue::fetch(&mut iter.peekable(), &value_key, + key, value)); + result.push_back(json_value); + Ok(()) + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + funs.push(self.ag.clone()); + } + + fn get_sorting(&self, sorts: &mut Vec>) { + sorts.push(self.sort.clone()); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + if let Some(sort_info) = map.remove(&self.kb.value_key(0)) { + self.sort = Some(sort_info.sort); + } + } + + fn json_result(&self, results: &mut VecDeque) -> Result { + if let Some(json) = results.pop_front() { + Ok(json) + } else { + panic!("missing result!"); + } + } +} + +/// A bind variable. If a bind variable was matched it will be fetched then it's path is +/// added to the bind_var_keys passed into fetch_result(). This will load the values from the +/// original document and return it. +pub struct RetBind { + pub bind_name: String, + pub extra_key: String, + pub ag: Option<(AggregateFun, JsonValue)>, + pub default: JsonValue, + pub sort: Option, +} + +impl Returnable for RetBind { + fn fetch_result(&self, iter: &mut DBIterator, _seq: u64, _score: f32, + bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error> { + + if let Some(value_keys) = bind_var_keys.get(&self.bind_name) { + let mut array = Vec::with_capacity(value_keys.len()); + for base_key in value_keys { + // Seek in index to >= entry + let value_key = base_key.to_string() + &self.extra_key; + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => { + result.push_back(self.default.clone()); + return Ok(()) + }, + }; + + if !key.starts_with(value_key.as_bytes()) { + array.push(self.default.clone()); + } else { + array.push(try!(RetValue::fetch(&mut iter.peekable(), &value_key, + key, value))); + } + } + result.push_back(JsonValue::Array(array)); + } else { + result.push_back(JsonValue::Array(vec![self.default.clone()])) + } + + Ok(()) + } + + fn get_aggregate_funs(&self, funs: &mut Vec>) { + funs.push(self.ag.clone()); + } + + fn get_sorting(&self, sorts: &mut Vec>) { + sorts.push(self.sort.clone()); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + if let Some(sort_info) = map.remove(&(self.bind_name.to_string() + &self.extra_key)) { + self.sort = Some(sort_info.sort); + } + } + + fn json_result(&self, results: &mut VecDeque) -> Result { + if let Some(json) = results.pop_front() { + Ok(json) + } else { + panic!("missing bind result!"); + } + } +} + +/// Returns a relevency score for a match. 
+pub struct RetScore { + pub sort: Option, +} + +impl Returnable for RetScore { + fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, score: f32, + _bind_var_keys: &HashMap>, + result: &mut VecDeque) -> Result<(), Error> { + result.push_back(JsonValue::Number(score as f64)); + Ok(()) + } + + fn get_aggregate_funs(&self, _funs: &mut Vec>) { + // noop + } + + fn get_sorting(&self, sorts: &mut Vec>) { + sorts.push(self.sort.clone()); + } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + if let Some(sort_info) = map.remove("score()") { + self.sort = Some(sort_info.sort); + } + } + + fn json_result(&self, results: &mut VecDeque) -> Result { + if let Some(json) = results.pop_front() { + Ok(json) + } else { + panic!("missing score result!"); + } + } +} \ No newline at end of file From 5aedfcce074ef85e6cb9c7a6bde38f723d2eb8fd Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sun, 15 Jan 2017 22:32:49 -0800 Subject: [PATCH 062/122] Preserve ordering of sort clause If the sort clause fields were ordered differently from the returned fields, sorting would use the order of the returned fields instead of the order in the sort clause. Fixed. --- repl-tests/collation.noise | 16 +++++++ src/parser.rs | 18 +++---- src/query.rs | 23 +++++---- src/returnable.rs | 96 ++++++++++++++++++-------------------- 4 files changed, 86 insertions(+), 67 deletions(-) diff --git a/repl-tests/collation.noise index 87c0f4e..c036969 100644 --- a/repl-tests/collation.noise +++ b/repl-tests/collation.noise @@ -118,6 +118,22 @@ return [.bar[0], .bar[1], .bar[2]] ; [5,4,3] ] +find {foo: =="coll2"} +sort .bar[0] asc, .bar[1] desc, .bar[2] desc +return [.bar[2], .bar[1], .bar[0]] ; +[ +[2,2,1], +[2,1,1], +[1,1,1], +[2,3,2], +[2,2,2], +[2,1,2], +[1,1,2], +[3,4,3], +[5,5,5], +[3,4,5] +] + find {foo: =="group2"} sort .baz asc, .bar desc return [.baz, .bar] diff --git a/src/parser.rs index 932a71f..1973ffd 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -774,6 +774,7 @@ impl<'a, 'c> Parser<'a, 'c> { pub fn sort_clause(&mut self) -> Result, Error> { let mut sort_infos = HashMap::new(); if self.consume("sort") { + let mut n = 0; loop { if let Some(kb) = try!(self.consume_keypath()) { // doing the search for source 2x so user can order @@ -806,7 +807,7 @@ impl<'a, 'c> Parser<'a, 'c> { }; sort_infos.insert(kb.value_key(0), SortInfo{field: SortField::FetchValue(kb), - sort: sort, + sort: sort, order_to_apply: n, default: default}); } else { try!(self.must_consume("score")); @@ -824,13 +825,14 @@ impl<'a, 'c> Parser<'a, 'c> { }; sort_infos.insert("score()".to_string(), - SortInfo{field: SortField::Score, + SortInfo{field: SortField::Score, order_to_apply: n, sort: sort, default: JsonValue::Null}); } if !self.consume(",") { break; } + n += 1; } if sort_infos.is_empty() { return Err(Error::Parse("Expected field path in sort expression.".to_string())); @@ -849,7 +851,7 @@ impl<'a, 'c> Parser<'a, 'c> { } else { let mut kb = KeyBuilder::new(); kb.push_object_key("_id"); - Ok(Box::new(RetValue{kb: kb, ag:None, default: JsonValue::Null, sort: None})) + Ok(Box::new(RetValue{kb: kb, ag:None, default: JsonValue::Null, sort_info: None})) } } @@ -907,7 +909,7 @@ impl<'a, 'c> Parser<'a, 'c> { if self.consume("(") { try!(self.must_consume(")")); self.needs_scoring = true; - return Ok(Some(Box::new(RetScore{sort: None}))); + return Ok(Some(Box::new(RetScore{sort_info: None}))); } else { //wasn't the score, maybe it's a bind variable self.offset = offset; @@ -923,10 +925,10 @@ impl<'a, 'c>
Parser<'a, 'c> { if let Some(bind_name) = bind_name_option { let extra_key = kb.value_key_path_only(); Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_key: extra_key, - ag: Some((ag, json)), default: default, sort:None}))) + ag: Some((ag, json)), default: default, sort_info:None}))) } else { Ok(Some(Box::new(RetValue{kb: kb, ag: Some((ag, json)), - default: default, sort:None}))) + default: default, sort_info:None}))) } } else if let Some(bind_name) = self.consume_field() { let extra_key = if let Some(kb) = try!(self.consume_keypath()) { @@ -942,7 +944,7 @@ impl<'a, 'c> Parser<'a, 'c> { }; Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_key: extra_key, - ag: None, default: default, sort:None}))) + ag: None, default: default, sort_info:None}))) } else if let Some(kb) = try!(self.consume_keypath()) { let default = if let Some(default) = try!(self.consume_default()) { default @@ -950,7 +952,7 @@ impl<'a, 'c> Parser<'a, 'c> { JsonValue::Null }; - Ok(Some(Box::new(RetValue{kb: kb, ag: None, default: default, sort: None}))) + Ok(Some(Box::new(RetValue{kb: kb, ag: None, default: default, sort_info: None}))) } else if self.could_consume("{") { Ok(Some(try!(self.ret_object()))) } else if self.could_consume("[") { diff --git a/src/query.rs b/src/query.rs index e0c5d70..9b8482c 100644 --- a/src/query.rs +++ b/src/query.rs @@ -198,15 +198,16 @@ impl Query { if !sorts.is_empty() { let mut vec: Vec> = Vec::new(); for (_key, sort_info) in sorts.into_iter() { + let sort = sort_info.clone(); match sort_info.field { SortField::FetchValue(kb) => { vec.push(Box::new(RetValue{ kb: kb, ag: None, default: sort_info.default, - sort: Some(sort_info.sort)})); + sort_info: Some(sort)})); }, SortField::Score => { - vec.push(Box::new(RetScore{ sort: Some(sort_info.sort)})); + vec.push(Box::new(RetScore{ sort_info: Some(sort)})); }, } } @@ -236,20 +237,23 @@ impl Query { // this way we don't needlesss loop over the actions where most are noops - let mut sorts = Vec::new(); - if has_sorting { + let mut sorts = if has_sorting { + let mut sorts = Vec::new(); let mut sorting = Vec::new(); returnable.get_sorting(&mut sorting); let mut n = sorting.len(); while let Some(option) = sorting.pop() { n -= 1; - if let Some(sort_dir) = option { - sorts.push((sort_dir, n)); + if let Some(sort_info) = option { + sorts.push((sort_info, n)); } } // order we process sorts is important - sorts.reverse(); - } + sorts.sort_by_key(|&(ref sort_info, ref _n)| sort_info.order_to_apply); + sorts.into_iter().map(|(sort_info, n)| (sort_info.sort, n)).collect() + } else { + Vec::new() + }; let mut does_group_or_aggr = false; @@ -832,13 +836,16 @@ pub enum Sort { Desc, } +#[derive(Clone)] pub enum SortField { FetchValue(KeyBuilder), Score, } +#[derive(Clone)] pub struct SortInfo { pub field: SortField, + pub order_to_apply: usize, pub sort: Sort, pub default: JsonValue, } diff --git a/src/returnable.rs b/src/returnable.rs index 02ffe16..6873c4f 100644 --- a/src/returnable.rs +++ b/src/returnable.rs @@ -9,7 +9,7 @@ use std::iter::Iterator; use error::Error; use key_builder::{KeyBuilder, Segment}; use json_value::{JsonValue}; -use query::{Sort, AggregateFun, SortInfo}; +use query::{AggregateFun, SortInfo}; use rocksdb::{self, DBIterator, IteratorMode}; @@ -40,7 +40,7 @@ pub trait Returnable { /// Each Returnable will return the sorting direction in the same slot as the returnable /// so that later after fetching they will be sorted by QueryResults after fetching but /// converting to the final json result. 
- fn get_sorting(&self, sorts: &mut Vec>); + fn get_sorting(&mut self, sorts: &mut Vec>); /// This is the final step of a Returnable. The previous fetched JsonValues are now /// rendered with other ornamental json elements. @@ -67,12 +67,6 @@ impl Returnable for RetObject { field.get_aggregate_funs(funs); } } - - fn get_sorting(&self, sorts: &mut Vec>) { - for &(ref _key, ref field) in self.fields.iter() { - field.get_sorting(sorts); - } - } fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { for &mut (ref _key, ref mut field) in self.fields.iter_mut() { @@ -80,6 +74,12 @@ impl Returnable for RetObject { } } + fn get_sorting(&mut self, sorts: &mut Vec>) { + for &mut (ref mut _key, ref mut field) in self.fields.iter_mut() { + field.get_sorting(sorts); + } + } + fn json_result(&self, results: &mut VecDeque) -> Result { let mut vec = Vec::with_capacity(self.fields.len()); for &(ref key, ref returnable) in self.fields.iter() { @@ -109,12 +109,6 @@ impl Returnable for RetArray { slot.get_aggregate_funs(funs); } } - - fn get_sorting(&self, sorts: &mut Vec>) { - for ref slot in self.slots.iter() { - slot.get_sorting(sorts); - } - } fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { for slot in self.slots.iter_mut() { @@ -122,6 +116,12 @@ impl Returnable for RetArray { } } + fn get_sorting(&mut self, sorts: &mut Vec>) { + for ref mut slot in self.slots.iter_mut() { + slot.get_sorting(sorts); + } + } + fn json_result(&self, results: &mut VecDeque) -> Result { let mut vec = Vec::with_capacity(self.slots.len()); for slot in self.slots.iter() { @@ -142,7 +142,7 @@ impl Returnable for RetHidden { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { - for ref mut unrendered in self.unrendered.iter() { + for ref unrendered in self.unrendered.iter() { try!(unrendered.fetch_result(iter, seq, score, bind_var_keys, result)); } @@ -152,18 +152,18 @@ impl Returnable for RetHidden { fn get_aggregate_funs(&self, funs: &mut Vec>) { self.visible.get_aggregate_funs(funs); } + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + self.visible.take_sort_for_matching_fields(map); + } - fn get_sorting(&self, sorts: &mut Vec>) { - for ref mut unrendered in self.unrendered.iter() { + fn get_sorting(&mut self, sorts: &mut Vec>) { + for ref mut unrendered in self.unrendered.iter_mut() { unrendered.get_sorting(sorts); } self.visible.get_sorting(sorts); } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.visible.take_sort_for_matching_fields(map); - } fn json_result(&self, results: &mut VecDeque) -> Result { for _n in 0..self.unrendered.len() { @@ -175,7 +175,7 @@ impl Returnable for RetHidden { } /// A literal JsonValue. Number, String, Null, True or False. Just in case the query -/// wants to return something that doesn't come fro ma document. +/// wants to return something that doesn't come from a document. 
pub struct RetLiteral { pub json: JsonValue, } @@ -190,15 +190,15 @@ impl Returnable for RetLiteral { fn get_aggregate_funs(&self, _funs: &mut Vec>) { //noop } - - fn get_sorting(&self, _sorts: &mut Vec>) { - //noop - } fn take_sort_for_matching_fields(&mut self, _map: &mut HashMap) { //noop } + fn get_sorting(&mut self, _sorts: &mut Vec>) { + //noop + } + fn json_result(&self, _results: &mut VecDeque) -> Result { Ok(self.json.clone()) } @@ -210,7 +210,7 @@ pub struct RetValue { pub kb: KeyBuilder, pub ag: Option<(AggregateFun, JsonValue)>, pub default: JsonValue, - pub sort: Option, + pub sort_info: Option, } impl RetValue { @@ -386,15 +386,13 @@ impl Returnable for RetValue { fn get_aggregate_funs(&self, funs: &mut Vec>) { funs.push(self.ag.clone()); } - - fn get_sorting(&self, sorts: &mut Vec>) { - sorts.push(self.sort.clone()); - } fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - if let Some(sort_info) = map.remove(&self.kb.value_key(0)) { - self.sort = Some(sort_info.sort); - } + self.sort_info = map.remove(&self.kb.value_key(0)); + } + + fn get_sorting(&mut self, sorts: &mut Vec>) { + sorts.push(self.sort_info.take()); } fn json_result(&self, results: &mut VecDeque) -> Result { @@ -414,7 +412,7 @@ pub struct RetBind { pub extra_key: String, pub ag: Option<(AggregateFun, JsonValue)>, pub default: JsonValue, - pub sort: Option, + pub sort_info: Option, } impl Returnable for RetBind { @@ -456,15 +454,13 @@ impl Returnable for RetBind { fn get_aggregate_funs(&self, funs: &mut Vec>) { funs.push(self.ag.clone()); } - - fn get_sorting(&self, sorts: &mut Vec>) { - sorts.push(self.sort.clone()); - } fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - if let Some(sort_info) = map.remove(&(self.bind_name.to_string() + &self.extra_key)) { - self.sort = Some(sort_info.sort); - } + self.sort_info = map.remove(&(self.bind_name.to_string() + &self.extra_key)); + } + + fn get_sorting(&mut self, sorts: &mut Vec>) { + sorts.push(self.sort_info.take()); } fn json_result(&self, results: &mut VecDeque) -> Result { @@ -478,7 +474,7 @@ impl Returnable for RetBind { /// Returns a relevency score for a match. pub struct RetScore { - pub sort: Option, + pub sort_info: Option, } impl Returnable for RetScore { @@ -492,15 +488,13 @@ impl Returnable for RetScore { fn get_aggregate_funs(&self, _funs: &mut Vec>) { // noop } - - fn get_sorting(&self, sorts: &mut Vec>) { - sorts.push(self.sort.clone()); - } fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - if let Some(sort_info) = map.remove("score()") { - self.sort = Some(sort_info.sort); - } + self.sort_info = map.remove("score()"); + } + + fn get_sorting(&mut self, sorts: &mut Vec>) { + sorts.push(self.sort_info.take()); } fn json_result(&self, results: &mut VecDeque) -> Result { @@ -510,4 +504,4 @@ impl Returnable for RetScore { panic!("missing score result!"); } } -} \ No newline at end of file +} From 27ab79b67e641406c9199be86658ffde3fa78de7 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Mon, 16 Jan 2017 16:03:27 -0800 Subject: [PATCH 063/122] Removed code that ignores fields starting with underscore. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It didn’t work. Also fixed a bug where a spurious empty object would be written to the stored JSON when an object has no text field in it. Added a test for it.
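The fix works by tracking, for each open object, whether any of its keys was actually indexed, so only a genuinely empty object gets the special empty-object marker. A minimal sketch of that bookkeeping follows, with Tracker as a hypothetical stand-in for the Shredder state in the diff below (the method names object_start, key_indexed, and object_end_needs_marker are illustrative, not the patch's API):

    // Sketch: one bool per currently open object, pushed on ObjectStart.
    struct Tracker {
        object_keys_indexed: Vec<bool>,
    }

    impl Tracker {
        fn object_start(&mut self) {
            // A fresh object has had no keys indexed yet.
            self.object_keys_indexed.push(false);
        }

        fn key_indexed(&mut self) {
            // A key/value pair was written for the innermost open object.
            if let Some(top) = self.object_keys_indexed.last_mut() {
                *top = true;
            }
        }

        fn object_end_needs_marker(&mut self) -> bool {
            // True when the just-closed object never had a key indexed,
            // i.e. it was empty, so the special 'o' value must be written
            // for the empty object to survive a round trip through the index.
            !self.object_keys_indexed.pop().unwrap_or(true)
        }
    }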
--- repl-tests/query_basic.noise | 8 ++ src/json_shred.rs | 182 +++++++++++++---------------------- tests/repl_tests.rs | 3 + 3 files changed, 77 insertions(+), 116 deletions(-) diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise index 648ddcc..bd327c3 100644 --- a/repl-tests/query_basic.noise +++ b/repl-tests/query_basic.noise @@ -30,6 +30,8 @@ add {"_id":"12", "A":["1","2","3","4","5","6","7","8","9","10","11","12"]}; "12" add {"_id":"13", "A":["foo",1,true,false,null,{},[]]}; "13" +add {"_id":"14", "A":{"B":true}}; +"14" # Exact match object fields in arrays @@ -225,3 +227,9 @@ return {"a":"a","b":1.123,"true":true,"false":false,"null":null,array:[],object: [ {"a":"a","b":1.123,"true":true,"false":false,"null":null,"array":[],"object":{}} ] + +find {_id: =="14"} return .; +[ +{"A":{"B":true},"_id":"14"} +] + diff --git a/src/json_shred.rs b/src/json_shred.rs index ded1c0a..c64ac2b 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -27,23 +27,15 @@ enum ObjectKeyTypes { Id, /// Normal key Key(String), - /// Reserved key starting with underscore - Ignore, /// No key found NoKey, } -pub trait Indexable { - -} - - #[derive(Debug)] pub struct Shredder { kb: KeyBuilder, - // Top-level fields prefixed with an underscore are ignored - ignore_children: usize, doc_id: String, + object_keys_indexed: Vec } @@ -51,8 +43,8 @@ impl Shredder { pub fn new() -> Shredder { Shredder{ kb: KeyBuilder::new(), - ignore_children: 0, doc_id: String::new(), + object_keys_indexed: Vec::new(), } } @@ -110,28 +102,23 @@ impl Shredder { fn maybe_add_value(&mut self, parser: &Parser, code: char, value: &[u8], docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { - if self.ignore_children == 0 { - match self.extract_key(parser.stack().top()) { - ObjectKeyTypes::Id => { - return Err(Error::Shred( - "Expected string for `_id` field, got another type".to_string())); - }, - ObjectKeyTypes::Key(key) => { - // Pop the dummy object that makes ObjectEnd happy - // or the previous object key - self.kb.pop_object_key(); - self.kb.push_object_key(&key); - - try!(self.add_value(code, &value, docseq, batch)); - }, - ObjectKeyTypes::NoKey => { - try!(self.add_value(code, &value, docseq, batch)); - self.kb.inc_top_array_offset(); - }, - ObjectKeyTypes::Ignore => { - self.ignore_children = 1; - }, - } + match self.extract_key(parser.stack().top()) { + ObjectKeyTypes::Id => { + return Err(Error::Shred( + "Expected string for `_id` field, got another type".to_string())); + }, + ObjectKeyTypes::Key(key) => { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.kb.pop_object_key(); + self.kb.push_object_key(&key); + *self.object_keys_indexed.last_mut().unwrap() = true; + try!(self.add_value(code, &value, docseq, batch)); + }, + ObjectKeyTypes::NoKey => { + try!(self.add_value(code, &value, docseq, batch)); + self.kb.inc_top_array_offset(); + }, } Ok(()) } @@ -139,12 +126,8 @@ impl Shredder { fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { match stack_element { Some(StackElement::Key(key)) => { - if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { - if key == "_id" { - ObjectKeyTypes::Id - } else { - ObjectKeyTypes::Ignore - } + if self.kb.keypath_segments_len() == 1 && key == "_id" { + ObjectKeyTypes::Id } else { ObjectKeyTypes::Key(key.to_string()) } @@ -157,13 +140,9 @@ impl Shredder { // Don't push them if they are reserved fields (starting with underscore) fn maybe_push_key(&mut self, stack_element: Option) -> Result<(), 
Error> { if let Some(StackElement::Key(key)) = stack_element { - if self.kb.keypath_segments_len() == 1 && key.starts_with("_") { - if key == "_id" { - return Err(Error::Shred( + if self.kb.keypath_segments_len() == 1 && key == "_id" { + return Err(Error::Shred( "Expected string for `_id` field, got another type".to_string())); - } else { - self.ignore_children = 1; - } } else { // Pop the dummy object that makes ObjectEnd happy // or the previous object key @@ -177,91 +156,63 @@ impl Shredder { pub fn shred(&mut self, json: &str, docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result { let mut parser = Parser::new(json.chars()); - let mut token = parser.next(); - - // this will keep track of objects where encountered keys. - // if we didn't encounter keys then the top most element will be false. - let mut object_keys_indexed = Vec::new(); loop { // Get the next token, so that in case of an `ObjectStart` the key is already // on the stack. - match token.take() { + match parser.next().take() { Some(JsonEvent::ObjectStart) => { - if self.ignore_children > 0 { - self.ignore_children += 1; - } - else { - try!(self.maybe_push_key(parser.stack().top())); - // Just push something to make `ObjectEnd` happy - self.kb.push_object_key(""); - object_keys_indexed.push(false); - } + try!(self.maybe_push_key(parser.stack().top())); + // Just push something to make `ObjectEnd` happy + self.kb.push_object_key(""); + self.object_keys_indexed.push(false); }, Some(JsonEvent::ObjectEnd) => { - if self.ignore_children > 0 { - self.ignore_children -= 1; - } else { - self.kb.pop_object_key(); - if !object_keys_indexed.pop().unwrap() { - // this means we never wrote a key because the object was empty. - // So preserve the empty object by writing a special value. - try!(self.maybe_add_value(&parser, 'o', &[], docseq, batch)); - } - self.kb.inc_top_array_offset(); + self.kb.pop_object_key(); + if !self.object_keys_indexed.pop().unwrap() { + // this means we never wrote a key because the object was empty. + // So preserve the empty object by writing a special value. + try!(self.maybe_add_value(&parser, 'o', &[], docseq, batch)); } + self.kb.inc_top_array_offset(); }, Some(JsonEvent::ArrayStart) => { - if self.ignore_children > 0 { - self.ignore_children += 1; - } else { - try!(self.maybe_push_key(parser.stack().top())); - self.kb.push_array(); - } + try!(self.maybe_push_key(parser.stack().top())); + self.kb.push_array(); }, Some(JsonEvent::ArrayEnd) => { - if self.ignore_children > 0 { - self.ignore_children -= 1; + if self.kb.peek_array_offset() == 0 { + // this means we never wrote a value because the object was empty. + // So preserve the empty array by writing a special value. + self.kb.pop_array(); + try!(self.maybe_add_value(&parser, 'a', &[], docseq, batch)); } else { - if self.kb.peek_array_offset() == 0 { - // this means we never wrote a value because the object was empty. - // So preserve the empty array by writing a special value. 
- self.kb.pop_array(); + try!(self.maybe_add_value(&parser, 'a', &[], docseq, batch)); } else { - if self.kb.peek_array_offset() == 0 { - // this means we never wrote a value because the object was empty. - // So preserve the empty array by writing a special value. - self.kb.pop_array(); - try!(self.maybe_add_value(&parser, 'a', &[], docseq, batch)); - } else { - self.kb.pop_array(); - } - self.kb.inc_top_array_offset(); + self.kb.pop_array(); } + self.kb.inc_top_array_offset(); }, Some(JsonEvent::StringValue(value)) => { - // No children to ignore - if self.ignore_children == 0 { - match self.extract_key(parser.stack().top()) { - ObjectKeyTypes::Id => { - self.doc_id = value.clone(); - self.kb.pop_object_key(); - self.kb.push_object_key("_id"); - *object_keys_indexed.last_mut().unwrap() = true; - - try!(self.add_entries(&value, docseq, batch)); - }, - ObjectKeyTypes::Key(key) => { - // Pop the dummy object that makes ObjectEnd happy - // or the previous object key - self.kb.pop_object_key(); - self.kb.push_object_key(&key); - *object_keys_indexed.last_mut().unwrap() = true; - - try!(self.add_entries(&value, docseq, batch)); - }, - ObjectKeyTypes::NoKey => { - try!(self.add_entries(&value, docseq, batch)); - self.kb.inc_top_array_offset(); - }, - ObjectKeyTypes::Ignore => { - self.ignore_children = 1; - }, - } + match self.extract_key(parser.stack().top()) { + ObjectKeyTypes::Id => { + self.doc_id = value.clone(); + self.kb.pop_object_key(); + self.kb.push_object_key("_id"); + *self.object_keys_indexed.last_mut().unwrap() = true; + + try!(self.add_entries(&value, docseq, batch)); + }, + ObjectKeyTypes::Key(key) => { + // Pop the dummy object that makes ObjectEnd happy + // or the previous object key + self.kb.pop_object_key(); + self.kb.push_object_key(&key); + *self.object_keys_indexed.last_mut().unwrap() = true; + + try!(self.add_entries(&value, docseq, batch)); + }, + ObjectKeyTypes::NoKey => { + try!(self.add_entries(&value, docseq, batch)); + self.kb.inc_top_array_offset(); + }, } } }, Some(JsonEvent::BooleanValue(tf)) => { @@ -292,8 +243,6 @@ impl Shredder { break; } }; - - token = parser.next(); } Ok(self.doc_id.clone()) } @@ -411,6 +360,7 @@ mod tests { rocks.write(batch).unwrap(); let result = positions_from_rocks(&rocks); + assert!(result.is_empty()); } } diff --git a/tests/repl_tests.rs index bd85d7e..a2bdbae 100644 --- a/tests/repl_tests.rs +++ b/tests/repl_tests.rs @@ -51,6 +51,9 @@ fn test_repl() { println!("{} successful", path.file_name().unwrap().to_str().unwrap().to_string()); } } + if total == 0 { + panic!("No tests were run!"); + } if failures > 0 { panic!("Failed {} tests in repl-test out of {}", failures, total); } From a0b0e26a3676b8bbc0552b80d91515e59e6fcefa Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 17 Jan 2017 12:13:34 -0800 Subject: [PATCH 064/122] delete and update docs, missing _id support Added a command to delete a document and a commit command. Updating a document now works and removes all keys associated with the previous version. When updating a document, only the fields that change need to be written and indexed. Added a compaction filter to remove merge fields once they go to zero. Updated RocksDB and had to change some code in the sort functions because of it.
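The compaction filter's rule is simple: only the merge-maintained count keyspaces are candidates, and an entry whose merged count has reached zero carries no information and can be dropped. A condensed restatement of that decision rule, as a sketch (should_keep is an illustrative name; the real filter in src/index.rs below decodes the varint-encoded count with Index::convert_bytes_to_i32):

    // Sketch: keep everything except count keys (the 'F' and 'K' prefixes
    // in this patch) whose +1/-1 merge operands have summed to zero.
    fn should_keep(key: &[u8], merged_count: i32) -> bool {
        let is_count_key = key.first() == Some(&b'F') || key.first() == Some(&b'K');
        !(is_count_key && merged_count == 0)
    }

As the new test below notes, compaction may need to run twice before such keys disappear: the first pass applies the pending merges, the second collects the keys that merged to zero.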
--- .gitignore | 2 +- Cargo.toml | 3 +- repl-tests/deletion_updates.noise | 53 +++++++ repl-tests/group.noise | 36 ++--- src/filters.rs | 6 +- src/index.rs | 219 +++++++++++++++++++------- src/json_shred.rs | 252 +++++++++++++++++++++--------- src/key_builder.rs | 65 ++++++-- src/parser.rs | 4 +- src/query.rs | 4 +- src/repl.rs | 15 +- tests/repl_tests.rs | 4 +- 12 files changed, 483 insertions(+), 180 deletions(-) create mode 100644 repl-tests/deletion_updates.noise diff --git a/.gitignore b/.gitignore index aacb5b4..bc5c4e7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,6 @@ Cargo.lock **/*.iml .idea/ - +*.reject .DS_Store diff --git a/Cargo.toml b/Cargo.toml index 911ce70..24c2e0a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ rustc-serialize = "0.3.19" stemmer = "0.3.2" unicode-normalization = "0.1.2" unicode-segmentation = "0.1.2" -rocksdb = "0.5.0" +rocksdb = "0.6.0" varint = "0.9.0" +uuid = { version = "0.3", features = ["v4"] } diff --git a/repl-tests/deletion_updates.noise b/repl-tests/deletion_updates.noise new file mode 100644 index 0000000..5f2f9c2 --- /dev/null +++ b/repl-tests/deletion_updates.noise @@ -0,0 +1,53 @@ +# update and deletion tests + +# add before opening +add {"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}; +Write error: Index isn't open. + +drop target/tests/updatedeletion; +create target/tests/updatedeletion; + +add {"_id":"1", "A":[{"B":"B2","C":"C2"},{"B": "b1","C":"C2"}]}; +"1" +add {"_id":"2", "A":[{"B":"B2","C":[{"D":"D"}]},{"B": "b1","C":"C2"}]}; +"2" +add {"_id":"3", "A":"Multi word sentence"}; +"3" +add {"_id":"4", "A":"%&%}{}@);€"}; +"4" +add {"_id":"5", "A":"word"}; +"5" + +# delete before committing +del 5; +Write error: Attempt to delete doc with same _id added earlier + +# delete what doesn't exist +del 6; +not found + +commit; + +del 5; +ok + +add {"_id":"5", "A":"word"}; +"5" + +# add again without committing +add {"_id":"5", "A":"word"}; +Write error: Attempt to insert multiple docs with same _id + +commit; + +# add existing document +add {"_id":"5", "A":"wassup!"}; +"5" + +find {A: == "wassup!"}; +[ +"5" +] + +find {A: == "word"}; +[] diff --git a/repl-tests/group.noise b/repl-tests/group.noise index 22a2ab4..dbfc952 100644 --- a/repl-tests/group.noise +++ b/repl-tests/group.noise @@ -90,24 +90,24 @@ return {max: max(.baz)}; {"max":"c"} ] -add {"_id":"1", "foo":"group2", "baz": "a", "bar": "a"}; -"1" -add {"_id":"2", "foo":"group2", "baz": "a", "bar": "b"}; -"2" -add {"_id":"3", "foo":"group2", "baz": "b", "bar": "a"}; -"3" -add {"_id":"4", "foo":"group2", "baz": "b", "bar": "b"}; -"4" -add {"_id":"5", "foo":"group2", "baz": "a", "bar": "a"}; -"5" -add {"_id":"6", "foo":"group2", "baz": "a", "bar": "c"}; -"6" -add {"_id":"7", "foo":"group2", "baz": "b", "bar": "d"}; -"7" -add {"_id":"8", "foo":"group2", "baz": "b", "bar": "e"}; -"8" -add {"_id":"9", "foo":"group2", "baz": "a", "bar": "f"}; -"9" +add {"_id":"10", "foo":"group2", "baz": "a", "bar": "a"}; +"10" +add {"_id":"11", "foo":"group2", "baz": "a", "bar": "b"}; +"11" +add {"_id":"12", "foo":"group2", "baz": "b", "bar": "a"}; +"12" +add {"_id":"13", "foo":"group2", "baz": "b", "bar": "b"}; +"13" +add {"_id":"14", "foo":"group2", "baz": "a", "bar": "a"}; +"14" +add {"_id":"15", "foo":"group2", "baz": "a", "bar": "c"}; +"15" +add {"_id":"16", "foo":"group2", "baz": "b", "bar": "d"}; +"16" +add {"_id":"17", "foo":"group2", "baz": "b", "bar": "e"}; +"17" +add {"_id":"18", "foo":"group2", "baz": "a", "bar": "f"}; +"18" find {foo: =="group2"} return [group(.baz 
order=asc), group(.bar order=desc), count()]; diff --git a/src/filters.rs b/src/filters.rs index 4f377c9..2a806c7 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -42,14 +42,14 @@ impl Scorer { fn init(&mut self, qsi: &mut QueryScoringInfo) { let key = self.kb.keypathword_count_key(&self.word); let doc_freq = if let Some(bytes) = self.get_value(&key) { - Index::convert_bytes_to_u32(bytes.as_ref()) as f32 + Index::convert_bytes_to_i32(bytes.as_ref()) as f32 } else { 0.0 }; let key = self.kb.keypath_count_key(); let num_docs = if let Some(bytes) = self.get_value(&key) { - Index::convert_bytes_to_u32(bytes.as_ref()) as f32 + Index::convert_bytes_to_i32(bytes.as_ref()) as f32 } else { 0.0 }; @@ -77,7 +77,7 @@ impl Scorer { if self.should_score() { let key = self.kb.field_length_key_from_doc_result(dr); let total_field_words = if let Some(bytes) = self.get_value(&key) { - Index::convert_bytes_to_u32(bytes.as_ref()) as f32 + Index::convert_bytes_to_i32(bytes.as_ref()) as f32 } else { panic!("Couldn't find field length for a match!! WHAT!"); }; diff --git a/src/index.rs b/src/index.rs index 2c1de5c..6ef57e4 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,18 +1,22 @@ extern crate rocksdb; extern crate varint; +extern crate uuid; -use std::collections::HashMap; +use std::collections::{HashSet, BTreeMap}; use std::str; use std::io::Cursor; use std::mem; use std::io::Write; +use self::uuid::{Uuid, UuidVersion}; +use std::cmp::Ordering; use self::varint::{VarintRead, VarintWrite}; -use rocksdb::MergeOperands; +use rocksdb::{MergeOperands, IteratorMode, CompactionDecision}; use error::Error; use json_shred::{Shredder}; +use key_builder::KeyBuilder; const NOISE_HEADER_VERSION: u64 = 1; @@ -20,7 +24,7 @@ pub struct Index { write_options: rocksdb::WriteOptions, high_doc_seq: u64, pub rocks: Option, - id_str_to_id_seq: HashMap, + id_str_in_batch: HashSet, batch: Option, } @@ -34,7 +38,7 @@ impl Index { write_options: rocksdb::WriteOptions::new(), high_doc_seq: 0, rocks: None, - id_str_to_id_seq: HashMap::new(), + id_str_in_batch: HashSet::new(), batch: None, } } @@ -44,8 +48,9 @@ impl Index { //fn open(&mut self, name: &str, open_options: Option) -> Result { pub fn open(&mut self, name: &str, open_options: Option) -> Result<(), Error> { let mut rocks_options = rocksdb::Options::default(); - rocks_options.set_comparator("noise", Index::compare_keys); - rocks_options.set_merge_operator("noise", Index::sum_merge); + rocks_options.set_comparator("noise_cmp", Index::compare_keys); + rocks_options.set_merge_operator("noise_merge", Index::sum_merge); + rocks_options.set_compaction_filter("noise_compact", Index::compaction_filter); let rocks = match rocksdb::DB::open(&rocks_options, name) { Ok(rocks) => rocks, @@ -82,9 +87,8 @@ impl Index { Ok(()) } - // NOTE vmx 2016-10-13: As one index is tied to one database, this should be a method - // without a parameter - pub fn delete(name: &str) -> Result<(), Error> { + //This deletes the Rockdbs instance from disk + pub fn drop(name: &str) -> Result<(), Error> { let ret = try!(rocksdb::DB::destroy(&rocksdb::Options::default(), name)); Ok(ret) } @@ -94,41 +98,93 @@ impl Index { return Err(Error::Write("Index isn't open.".to_string())); } let mut shredder = Shredder::new(); - - let docid = try!(shredder.shred(json, self.high_doc_seq + 1, - self.batch.as_mut().unwrap())); - if self.id_str_to_id_seq.contains_key(&docid) { - return Err(Error::Write("Attempt to insert multiple docs with same _id" - .to_string())); - } - self.high_doc_seq += 1; - 
self.id_str_to_id_seq.insert(format!("I{}", docid), format!("{}", self.high_doc_seq)); + let (seq, docid) = if let Some(docid) = try!(shredder.shred(json)) { + // user supplied doc id, see if we have an existing one. + if self.id_str_in_batch.contains(&docid) { + // oops use trying to add some doc 2x to this batch. + return Err(Error::Write("Attempt to insert multiple docs with same _id" + .to_string())); + } + if let Some((seq, existing_key_values)) = try!(self.gather_doc_fields(&docid)) { + shredder.merge_existing_doc(existing_key_values); + (seq, docid) + } else { + // no existing document found, so we use the one supplied. + self.high_doc_seq += 1; + (self.high_doc_seq, docid) + } + } else { + // no doc id supplied in document, so we create one. + let docid = Uuid::new(UuidVersion::Random).unwrap().simple().to_string(); + try!(shredder.add_id(&docid)); + self.high_doc_seq += 1; + (self.high_doc_seq, docid) + }; + // now everything needs to be added to the batch, + try!(shredder.add_all_to_batch(seq, &mut self.batch.as_mut().unwrap())); + self.id_str_in_batch.insert(docid.clone()); + Ok(docid) } - // Store the current batch - pub fn flush(&mut self) -> Result<(), Error> { - // Flush can only be called if the index is open + /// Returns Ok(true) if the document was found and deleted, Ok(false) if it could not be found + pub fn delete(&mut self, docid: &str) -> Result { if self.rocks.is_none() { return Err(Error::Write("Index isn't open.".to_string())); } - let rocks = self.rocks.as_ref().unwrap(); + if self.id_str_in_batch.contains(docid) { + // oops use trying to delete a doc that's in the batch. Can't happen, + return Err(Error::Write("Attempt to delete doc with same _id added earlier" + .to_string())); + } + if let Some((seq, key_values)) = try!(self.gather_doc_fields(docid)) { + let mut shredder = Shredder::new(); + try!(shredder.delete_existing_doc(docid, seq, key_values, + &mut self.batch.as_mut().unwrap())); + Ok(true) + } else { + Ok(false) + } + } - // Look up all doc ids and 'delete' from the seq_to_ids keyspace - for key in self.id_str_to_id_seq.keys() { - // TODO vmx 2016-10-17: USe multiget once the Rusts wrapper supports it - match rocks.get(key.as_bytes()) { - Ok(Some(seq)) => { - try!(self.batch.as_mut().unwrap().delete(&*seq)); - }, - _ => {} + fn gather_doc_fields(&self, docid: &str) -> + Result>)>, Error> { + if let Some(seq) = try!(self.fetch_seq(&docid)) { + // collect up all the fields for the existing doc + let kb = KeyBuilder::new(); + let value_key = kb.value_key(seq); + let mut key_values = BTreeMap::new(); + + let mut iter = self.rocks.as_ref().unwrap().iterator(IteratorMode::Start); + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + loop { + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => break, + }; + + if !key.starts_with(value_key.as_bytes()) { + break; + } + let key = unsafe{ str::from_utf8_unchecked(&key)}.to_string(); + let value = value.iter().map(|i|*i).collect(); + key_values.insert(key, value); } + return Ok(Some((seq, key_values))); + } else { + return Ok(None); } + } - // Add the ids_to_seq keyspace entry - for (id, seq) in &self.id_str_to_id_seq { - try!(self.batch.as_mut().unwrap().put(id.as_bytes(), seq.as_bytes())); + // Store the current batch + pub fn flush(&mut self) -> Result<(), Error> { + // Flush can only be called if the index is open + if self.rocks.is_none() { + return Err(Error::Write("Index isn't open.".to_string())); } + 
let rocks = self.rocks.as_ref().unwrap(); let mut bytes = Vec::with_capacity(8*2); bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap(); @@ -139,7 +195,7 @@ impl Index { // Make sure there's a always a valid WriteBarch after writing it into RocksDB, // else calls to `self.batch.as_mut().unwrap()` would panic. self.batch = Some(rocksdb::WriteBatch::default()); - self.id_str_to_id_seq.clear(); + self.id_str_in_batch.clear(); Ok(status) } @@ -160,46 +216,51 @@ impl Index { unsafe{ mem::transmute(val) } } - pub fn convert_bytes_to_u32(bytes: &[u8]) -> u32 { + pub fn convert_bytes_to_i32(bytes: &[u8]) -> i32 { let mut vec = Vec::with_capacity(bytes.len()); vec.extend(bytes.into_iter()); let mut read = Cursor::new(vec); - read.read_unsigned_varint_32().unwrap() + read.read_signed_varint_32().unwrap() } - pub fn convert_u32_to_bytes(val: u32) -> Vec { + pub fn convert_i32_to_bytes(val: i32) -> Vec { let mut bytes = Cursor::new(Vec::new()); - assert!(bytes.write_unsigned_varint_32(val).is_ok()); + assert!(bytes.write_signed_varint_32(val).is_ok()); bytes.into_inner() } - pub fn fetch_id(&self, seq: u64) -> Result, String> { - // Fetching an ID is only possible if the index is open + pub fn fetch_seq(&self, id: &str) -> Result, Error> { + // Fetching an seq is only possible if the index is open // NOTE vmx 2016-10-17: Perhaps that shouldn't panic? assert!(&self.rocks.is_some()); let rocks = self.rocks.as_ref().unwrap(); - let key = format!("S{}", seq); + let key = format!("I{}", id); match try!(rocks.get(&key.as_bytes())) { // If there is an id, it's UTF-8 - Some(id) => Ok(Some(id.to_utf8().unwrap().to_string())), + Some(bytes) => Ok(Some(bytes.to_utf8().unwrap().parse().unwrap())), None => Ok(None) } } - fn compare_keys(a: &[u8], b: &[u8]) -> i32 { - use std::cmp::Ordering; - use key_builder::KeyBuilder; + fn compaction_filter(_level: u32, key: &[u8], value: &[u8]) -> CompactionDecision { + if !(key[0] as char == 'F' || key[0] as char == 'K') { + return CompactionDecision::Keep; + } + if 0 == Index::convert_bytes_to_i32(&value) { + CompactionDecision::Remove + } else { + CompactionDecision::Keep + } + } + + fn compare_keys(a: &[u8], b: &[u8]) -> Ordering { if a[0] == 'W' as u8 && b[0] == 'W' as u8 { let astr = unsafe {str::from_utf8_unchecked(&a)}; let bstr = unsafe {str::from_utf8_unchecked(&b)}; KeyBuilder::compare_keys(astr, bstr) } else { - match a.cmp(b) { - Ordering::Less => -1, - Ordering::Greater => 1, - Ordering::Equal => 0, - } + a.cmp(b) } } @@ -212,16 +273,15 @@ impl Index { } let mut count = if let Some(bytes) = existing_val { - Index::convert_bytes_to_u32(&bytes) + Index::convert_bytes_to_i32(&bytes) } else { 0 }; for bytes in operands { - count += Index::convert_bytes_to_u32(&bytes); + count += Index::convert_bytes_to_i32(&bytes); } - - Index::convert_u32_to_bytes(count) + Index::convert_i32_to_bytes(count) } } @@ -230,15 +290,64 @@ impl Index { mod tests { extern crate rocksdb; use super::{Index, OpenOptions}; + use query::Query; + use std::str; #[test] fn test_open() { let dbname = "target/tests/firstnoisedb"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); //let db = super::Index::open("firstnoisedb", Option::None).unwrap(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); index.flush().unwrap(); } + + #[test] + fn test_uuid() { + let dbname = "target/tests/testuuid"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let id = 
index.add(r#"{"foo":"bar"}"#).unwrap(); + + index.flush().unwrap(); + + let mut results = Query::get_matches(r#"find {foo:=="bar"}"#, &index).unwrap(); + let query_id = results.get_next_id().unwrap().unwrap(); + assert!(query_id.len() == 32); + assert_eq!(query_id, id); + } + + #[test] + fn test_compaction() { + let dbname = "target/tests/testcompaction"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let id = index.add(r#"{"foo":"bar"}"#).unwrap(); + index.flush().unwrap(); + + index.delete(&id).unwrap(); + index.flush().unwrap(); + + let rocks = index.rocks.as_mut().unwrap(); + + // apparently you need to do compaction twice when there are merges + // first one lets the merges happen, the second lets them be collected. + // this is acceptable since eventually the keys go away. + // if this test fails non-deterministically we might have a problem. + rocks.compact_range(None, None); + rocks.compact_range(None, None); + + let mut iter = rocks.iterator(rocksdb::IteratorMode::Start); + let (key, _value) = iter.next().unwrap(); + assert!(key.starts_with(&b"HDB"[..])); + assert!(iter.next().is_none()); + } } diff --git a/src/json_shred.rs b/src/json_shred.rs index c64ac2b..cbea30a 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -2,11 +2,13 @@ extern crate rocksdb; extern crate rustc_serialize; extern crate varint; -use std::collections::HashMap; +use std::collections::{HashMap, BTreeMap}; use std::mem::transmute; use std::io::Write; use std::str::Chars; use std::io::Cursor; +use std::str; +use std::mem; use self::varint::VarintWrite; use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; @@ -20,8 +22,6 @@ use index::Index; // Callback based JSON streaming parser: https://github.com/gyscos/json-streamer.rs // Another parser pased on rustc_serializ: https://github.com/isagalaev/ijson-rust/blob/master/src/test.rs#L11 -type ArrayOffsets = Vec; - enum ObjectKeyTypes { /// _id field Id, @@ -34,78 +34,113 @@ enum ObjectKeyTypes { #[derive(Debug)] pub struct Shredder { kb: KeyBuilder, - doc_id: String, - object_keys_indexed: Vec + doc_id: Option, + object_keys_indexed: Vec, + shredded_key_values: BTreeMap>, + existing_key_value_to_delete: BTreeMap>, } - impl Shredder { pub fn new() -> Shredder { - Shredder{ + Shredder { kb: KeyBuilder::new(), - doc_id: String::new(), + doc_id: None, object_keys_indexed: Vec::new(), + shredded_key_values: BTreeMap::new(), + existing_key_value_to_delete: BTreeMap::new(), } } - fn add_entries(&mut self, text: &String, docseq: u64, batch: &mut rocksdb::WriteBatch) -> - Result<(), Error> { - let stems = Stems::new(text.as_str()); + fn add_entries(&mut self, text: &str, docseq: u64, + batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { + let stems = Stems::new(text); let mut word_to_word_positions = HashMap::new(); - let mut total_words: u32 = 0; + let mut total_words: i32 = 0; let mut one_enc_bytes = Cursor::new(Vec::new()); - assert!(one_enc_bytes.write_unsigned_varint_32(1).is_ok()); + let num = if delete {-1} else {1}; + assert!(one_enc_bytes.write_signed_varint_32(num).is_ok()); + for stem in stems { total_words += 1; - let &mut (ref mut word_positions, ref mut count) = word_to_word_positions.entry(stem.stemmed) - .or_insert((Cursor::new(Vec::new()), 0)); - assert!(word_positions.write_unsigned_varint_32(stem.word_pos).is_ok()); + let &mut (ref mut word_positions, ref mut count) = + word_to_word_positions.entry(stem.stemmed) + 
.or_insert((Cursor::new(Vec::new()), 0)); + if !delete { + assert!(word_positions.write_unsigned_varint_32(stem.word_pos).is_ok()); + } *count += 1; } for (stemmed, (word_positions, count)) in word_to_word_positions { let key = self.kb.stemmed_word_key(&stemmed, docseq); - try!(batch.put(&key.into_bytes(), &word_positions.into_inner())); + if delete { + try!(batch.delete(&key.into_bytes())); + } else { + try!(batch.put(&key.into_bytes(), &word_positions.into_inner())); + } let key = self.kb.field_length_key(docseq); - try!(batch.put(&key.into_bytes(), &Index::convert_u32_to_bytes(total_words))); + if delete { + try!(batch.delete(&key.into_bytes())); + } else { + try!(batch.put(&key.into_bytes(), &Index::convert_i32_to_bytes(total_words))); + } let key = self.kb.keypathword_count_key(&stemmed); - try!(batch.merge(&key.into_bytes(), &Index::convert_u32_to_bytes(count))); + if delete { + try!(batch.merge(&key.into_bytes(), &Index::convert_i32_to_bytes(-count))); + } else { + try!(batch.merge(&key.into_bytes(), &Index::convert_i32_to_bytes(count))); + } let key = self.kb.keypath_count_key(); try!(batch.merge(&key.into_bytes(), one_enc_bytes.get_ref())); } let key = self.kb.value_key(docseq); - let mut buffer = String::with_capacity(text.len() + 1); - buffer.push('s'); - buffer.push_str(&text); + if delete { + try!(batch.delete(&key.into_bytes())); + } else { + let mut buffer = String::with_capacity(text.len() + 1); + buffer.push('s'); + buffer.push_str(&text); + + try!(batch.put(&key.into_bytes(), &buffer.as_bytes())); + } - try!(batch.put(&key.into_bytes(), &buffer.as_bytes())); + let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); + if delete { + try!(batch.delete(&key.into_bytes())); + } else { + try!(batch.put(&key.into_bytes(), &docseq.to_string().as_bytes())); + } Ok(()) } - fn add_value(&mut self, code: char, value: &[u8], - docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { - let key = self.kb.value_key(docseq); + fn add_value(&mut self, code: char, value: &[u8]) -> Result<(), Error> { + let key = self.kb.value_key_path_only(); let mut buffer = Vec::with_capacity(value.len() + 1); buffer.push(code as u8); - try!((&mut buffer as &mut Write).write_all(&value)); - - try!(batch.put(&key.into_bytes(), &buffer.as_ref())); - + try!((&mut buffer as &mut Write).write_all(value)); + self.shredded_key_values.insert(key, buffer); Ok(()) } - fn maybe_add_value(&mut self, parser: &Parser, code: char, value: &[u8], - docseq: u64, batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + fn maybe_add_value(&mut self, parser: &Parser, code: char, value: &[u8]) -> Result<(), Error> { match self.extract_key(parser.stack().top()) { ObjectKeyTypes::Id => { - return Err(Error::Shred( - "Expected string for `_id` field, got another type".to_string())); + if code != 's' && self.kb.keypath_segments_len() == 1 { + // nested fields may be named _id; only the root _id field must be a string + return Err(Error::Shred( + "Expected string for `_id` field, got another type".to_string())); + } + self.doc_id = Some(unsafe{ str::from_utf8_unchecked(value) }.to_string()); + self.kb.pop_object_key(); + self.kb.push_object_key("_id"); + *self.object_keys_indexed.last_mut().unwrap() = true; + try!(self.add_value(code, &value)); }, ObjectKeyTypes::Key(key) => { // Pop the dummy object that makes ObjectEnd happy @@ -113,15 +148,16 @@ impl Shredder { self.kb.pop_object_key(); self.kb.push_object_key(&key); *self.object_keys_indexed.last_mut().unwrap() = true; -
try!(self.add_value(code, &value)); }, ObjectKeyTypes::NoKey => { - try!(self.add_value(code, &value, docseq, batch)); + try!(self.add_value(code, &value)); self.kb.inc_top_array_offset(); }, } Ok(()) } + // Extract the key if it exists and indicate whether it's a special type of key fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { match stack_element { @@ -153,8 +189,88 @@ impl Shredder { Ok(()) } - pub fn shred(&mut self, json: &str, docseq: u64, batch: &mut rocksdb::WriteBatch) -> - Result { + pub fn add_all_to_batch(&mut self, seq: u64, + batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + let mut key_values = BTreeMap::new(); + mem::swap(&mut key_values, &mut self.existing_key_value_to_delete); + for (key, value) in key_values.into_iter() { + self.kb.clear(); + self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); + if value[0] as char == 's' { + let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + try!(self.add_entries(text, seq, batch, true)); + } else { + try!(batch.delete(&key.into_bytes())); + } + } + let mut key_values = BTreeMap::new(); + mem::swap(&mut key_values, &mut self.shredded_key_values); + for (key, value) in key_values.into_iter() { + self.kb.clear(); + self.kb.parse_value_key_path_only(&key); + if value[0] as char == 's' { + let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + try!(self.add_entries(text, seq, batch, false)); + } else { + let key = self.kb.value_key(seq); + try!(batch.put(&key.into_bytes(), &value.as_ref())); + } + } + Ok(()) + } + + pub fn delete_existing_doc(&mut self, + docid: &str, + seq: u64, + existing: BTreeMap>, + batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + self.doc_id = Some(docid.to_string()); + for (key, value) in existing.into_iter() { + self.kb.clear(); + self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); + if value[0] as char == 's' { + let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + try!(self.add_entries(text, seq, batch, true)); + } else { + try!(batch.delete(&key.into_bytes())); + } + } + Ok(()) + } + + pub fn merge_existing_doc(&mut self, existing: BTreeMap>) { + // we found a doc with the same id already stored on disk. We need to delete + // the doc. But any fields that are the same we can just keep around + // and don't even need to reindex. + for (existing_key, existing_value) in existing { + let matches = { + let key = KeyBuilder::value_key_path_only_from_str(&existing_key); + if let Some(new_value) = self.shredded_key_values.get(key) { + *new_value == existing_value + } else { + false + } }; + if matches { + // we don't need to write or index these values, they already exist! + let key = KeyBuilder::value_key_path_only_from_str(&existing_key); + self.shredded_key_values.remove(key).unwrap(); + } else { + // we need to delete these keys and the index keys associated with the values + self.existing_key_value_to_delete.insert(existing_key, existing_value); + } + } + } + + pub fn add_id(&mut self, id: &str) -> Result<(), Error> { + self.doc_id = Some(id.to_string()); + self.kb.clear(); + self.kb.push_object_key("_id"); + try!(self.add_value('s', &id.as_bytes())); + Ok(()) + } + + pub fn shred(&mut self, json: &str) -> Result, Error> { let mut parser = Parser::new(json.chars()); loop { // Get the next token, so that in case of an `ObjectStart` the key is already @@ -171,7 +287,7 @@ impl Shredder { if !self.object_keys_indexed.pop().unwrap() { // this means we never wrote a key because the object was empty.
// So preserve the empty object by writing a special value. - try!(self.maybe_add_value(&parser, 'o', &[], docseq, batch)); + try!(self.maybe_add_value(&parser, 'o', &[])); } self.kb.inc_top_array_offset(); }, @@ -184,57 +300,35 @@ impl Shredder { // this means we never wrote a value because the object was empty. // So preserve the empty array by writing a special value. self.kb.pop_array(); - try!(self.maybe_add_value(&parser, 'a', &[], docseq, batch)); + try!(self.maybe_add_value(&parser, 'a', &[])); } else { self.kb.pop_array(); } self.kb.inc_top_array_offset(); }, Some(JsonEvent::StringValue(value)) => { - match self.extract_key(parser.stack().top()) { - ObjectKeyTypes::Id => { - self.doc_id = value.clone(); - self.kb.pop_object_key(); - self.kb.push_object_key("_id"); - *self.object_keys_indexed.last_mut().unwrap() = true; - - try!(self.add_entries(&value, docseq, batch)); - }, - ObjectKeyTypes::Key(key) => { - // Pop the dummy object that makes ObjectEnd happy - // or the previous object key - self.kb.pop_object_key(); - self.kb.push_object_key(&key); - *self.object_keys_indexed.last_mut().unwrap() = true; - - try!(self.add_entries(&value, docseq, batch)); - }, - ObjectKeyTypes::NoKey => { - try!(self.add_entries(&value, docseq, batch)); - self.kb.inc_top_array_offset(); - }, - } + try!(self.maybe_add_value(&parser, 's', &value.as_bytes())); }, Some(JsonEvent::BooleanValue(tf)) => { let code = if tf {'T'} else {'F'}; - try!(self.maybe_add_value(&parser, code, &[], docseq, batch)); + try!(self.maybe_add_value(&parser, code, &[])); }, Some(JsonEvent::I64Value(i)) => { let f = i as f64; let bytes = unsafe{ transmute::(f) }; - try!(self.maybe_add_value(&parser, 'f', &bytes[..], docseq, batch)); + try!(self.maybe_add_value(&parser, 'f', &bytes[..])); }, Some(JsonEvent::U64Value(u)) => { let f = u as f64; let bytes = unsafe{ transmute::(f) }; - try!(self.maybe_add_value(&parser, 'f', &bytes[..], docseq, batch)); + try!(self.maybe_add_value(&parser, 'f', &bytes[..])); }, Some(JsonEvent::F64Value(f)) => { let bytes = unsafe{ transmute::(f) }; - try!(self.maybe_add_value(&parser, 'f', &bytes[..], docseq, batch)); + try!(self.maybe_add_value(&parser, 'f', &bytes[..])); }, Some(JsonEvent::NullValue) => { - try!(self.maybe_add_value(&parser, 'N', &[], docseq, batch)); + try!(self.maybe_add_value(&parser, 'N', &[])); }, Some(JsonEvent::Error(error)) => { return Err(Error::Shred(error.to_string())); @@ -245,10 +339,9 @@ impl Shredder { }; } Ok(self.doc_id.clone()) - } + } } - #[cfg(test)] mod tests { extern crate rocksdb; @@ -280,16 +373,18 @@ mod tests { } - #[test] + #[test] fn test_shred_nested() { let mut shredder = super::Shredder::new(); let json = r#"{"some": ["array", "data", ["also", "nested"]]}"#; let docseq = 123; let mut batch = rocksdb::WriteBatch::default(); - shredder.shred(json, docseq, &mut batch).unwrap(); + shredder.shred(json).unwrap(); + shredder.add_id("foo").unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); let dbname = "target/tests/test_shred_netsted"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); @@ -299,6 +394,7 @@ mod tests { let result = positions_from_rocks(&rocks); let expected = vec![ + ("W._id!foo#123,".to_string(), vec![0]), ("W.some$!array#123,0".to_string(), vec![0]), ("W.some$!data#123,1".to_string(), vec![0]), ("W.some$$!also#123,2,0".to_string(), vec![0]), @@ -317,10 +413,11 @@ mod tests { let json = r#"{"A":[{"B":"B2VMX two 
three","C":"..C2"},{"B": "b1","C":"..C2"}]}"#; let docseq = 1234; let mut batch = rocksdb::WriteBatch::default(); - shredder.shred(json, docseq, &mut batch).unwrap(); + shredder.shred(json).unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); let dbname = "target/tests/test_shred_objects"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); @@ -348,10 +445,11 @@ mod tests { let json = r#"{}"#; let docseq = 123; let mut batch = rocksdb::WriteBatch::default(); - shredder.shred(json, docseq, &mut batch).unwrap(); + shredder.shred(json).unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); let dbname = "target/tests/test_shred_empty_object"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); diff --git a/src/key_builder.rs b/src/key_builder.rs index 62977f3..7ea41a1 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -2,6 +2,7 @@ extern crate unicode_normalization; use query::DocResult; use std::str; +use std::cmp::Ordering; use self::unicode_normalization::UnicodeNormalization; @@ -25,6 +26,11 @@ impl KeyBuilder { } } + pub fn clear(&mut self) { + self.keypath.clear(); + self.arraypath.clear(); + } + pub fn get_keypathword_only(&self, word: &str) -> String { let mut string = String::with_capacity(100); string.push('W'); @@ -57,6 +63,13 @@ impl KeyBuilder { string } + pub fn id_to_seq_key(&self, id: &str) -> String { + let mut str = String::with_capacity(id.len() + 1); + str.push('I'); + str.push_str(&id); + str + } + /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath /// built up internally. pub fn stemmed_word_key(&self, word: &str, seq: u64) -> String { @@ -140,6 +153,28 @@ impl KeyBuilder { } string } + + /// Returns a value key without the doc seq prepended. + pub fn value_key_path_only_from_str(str: &str) -> &str { + &str[str.find('#').unwrap() + 1..] 
+ } + + /// parses a value_key_path_only and sets the internal elements appropriately + pub fn parse_value_key_path_only(&mut self, mut str: &str) { + while let Some(tuple) = KeyBuilder::parse_first_key_value_segment(str) { + match tuple { + (Segment::ObjectKey(_key), unescaped) => { + str = &str[unescaped.len()..]; + self.keypath.push(unescaped); + }, + (Segment::Array(i), unescaped) => { + str = &str[unescaped.len()..]; + self.keypath.push("$".to_string()); + self.arraypath.push(i); + }, + } + } + } pub fn value_key_from_doc_result(&self, dr: &DocResult) -> String { let mut string = String::with_capacity(100); @@ -309,8 +344,7 @@ impl KeyBuilder { dr } - pub fn compare_keys(akey: &str, bkey: &str) -> i32 { - use std::cmp::Ordering; + pub fn compare_keys(akey: &str, bkey: &str) -> Ordering { debug_assert!(akey.starts_with('W')); debug_assert!(bkey.starts_with('W')); let (apath_str, aseq_str, aarraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); @@ -319,22 +353,18 @@ KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); match apath_str[1..].cmp(&bpath_str[1..]) { - Ordering::Less => -1, - Ordering::Greater => 1, + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, Ordering::Equal => { let aseq: u64 = aseq_str.parse().unwrap(); let bseq: u64 = bseq_str.parse().unwrap(); if aseq < bseq { - -1 + Ordering::Less } else if aseq > bseq { - 1 + Ordering::Greater } else { if aarraypath_str.is_empty() || barraypath_str.is_empty() { - match aarraypath_str.len().cmp(&barraypath_str.len()) { - Ordering::Less => -1, - Ordering::Greater => 1, - Ordering::Equal => 0, - } + aarraypath_str.len().cmp(&barraypath_str.len()) } else { let mut a_nums = aarraypath_str.split(","); let mut b_nums = barraypath_str.split(","); @@ -344,21 +374,22 @@ impl KeyBuilder { let a_num: u64 = a_num_str.parse().unwrap(); let b_num: u64 = b_num_str.parse().unwrap(); match a_num.cmp(&b_num) { - Ordering::Less => return -1, - Ordering::Greater => return 1, + Ordering::Less => return Ordering::Less, + Ordering::Greater => return Ordering::Greater, Ordering::Equal => (), } } else { //b is shorter than a, so greater - return 1; + return Ordering::Greater; } } else { if b_nums.next().is_some() { //a is shorter than b so less - return -1; + return Ordering::Less; } else { - // same length and must have hit all equal before this, so equal - return 0; + // same length and must have hit all equal before this, + // so equal + return Ordering::Equal; } } } diff --git a/src/parser.rs b/src/parser.rs index 1973ffd..437a19d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1079,7 +1079,7 @@ mod tests { #[test] fn test_whitespace() { let dbname = "target/tests/test_whitespace"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); @@ -1101,7 +1101,7 @@ mod tests { #[test] fn test_must_consume_string_literal() { let dbname = "target/tests/test_must_consume_string_literal"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); diff --git a/src/query.rs b/src/query.rs index 9b8482c..f8f6c99 100644 --- a/src/query.rs +++ b/src/query.rs @@ -863,7 +863,7 @@ mod tests { #[test] fn test_query_hello_world() { let dbname = "target/tests/querytestdbhelloworld"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); @@ -878,7 +878,7 @@ mod tests { #[test] fn
test_query_more_docs() { let dbname = "target/tests/querytestdbmoredocs"; - let _ = Index::delete(dbname); + let _ = Index::drop(dbname); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); diff --git a/src/repl.rs b/src/repl.rs index 8c092ad..12fd148 100644 --- a/src/repl.rs +++ b/src/repl.rs @@ -6,7 +6,8 @@ use std::io::{Write, BufRead}; fn is_command(str: &str) -> bool { - let commands = ["find", "add", "create", "drop", "open", "pretty"]; + let commands = ["find", "add", "create", "drop", "open", + "pretty", "commit", "del", "load"]; for command in commands.iter() { if str.starts_with(command) { return true; } @@ -75,7 +76,7 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { } } else if lines.starts_with("drop") { let dbname = lines[4..].trim_left(); - match Index::delete(dbname) { + match Index::drop(dbname) { Ok(()) => (), Err(reason) => write!(w, "{}\n", reason).unwrap(), } @@ -90,6 +91,16 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { Ok(id) => write!(w, "{}\n", JsonValue::str_to_literal(&id)).unwrap(), Err(reason) => write!(w, "{}\n", reason).unwrap(), } + } else if lines.starts_with("del") { + match index.delete(&lines[3..].trim_left()) { + Ok(true) => write!(w, "ok\n").unwrap(), + Ok(false) => write!(w, "not found\n").unwrap(), + Err(reason) => write!(w, "{}\n", reason).unwrap(), + } + } else if lines.starts_with("commit") { + if let Err(reason) = index.flush() { + write!(w, "{}\n", reason).unwrap(); + } } else if lines.starts_with("find") { if let Err(reason) = index.flush() { write!(w, "{}\n", reason).unwrap(); diff --git a/tests/repl_tests.rs b/tests/repl_tests.rs index a2bdbae..aa44b8a 100644 --- a/tests/repl_tests.rs +++ b/tests/repl_tests.rs @@ -26,18 +26,18 @@ fn test_repl() { continue; } total += 1; + let test_name = path.file_name().unwrap().to_str().unwrap().to_string(); + println!("About to run test {} ", test_name); let mut file = File::open(path.clone()).unwrap(); let mut file_buffer = Vec::new(); file.read_to_end(&mut file_buffer).unwrap(); let mut test_result_buffer = Vec::new(); let file = File::open(path.clone()).unwrap(); - repl(&mut BufReader::new(file), &mut test_result_buffer, true); if file_buffer != test_result_buffer { failures += 1; - let test_name = path.file_name().unwrap().to_str().unwrap().to_string(); path.set_extension("reject"); let reject = path.file_name().unwrap().to_str().unwrap().to_string(); From 1eddff3db0dae504807b3b811b7f0ad11d88e6e7 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 18 Jan 2017 12:23:25 -0800 Subject: [PATCH 065/122] Fixed bug where trailing unnormalized text causes panic The code in one part of the code calls is_alphabetic() on normalized text, and in another on pre-normalized text. Made them both call it on normalized text. Test case added. --- src/stems.rs | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/stems.rs b/src/stems.rs index 16dcc83..ecc33b2 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -38,13 +38,16 @@ impl<'a> Iterator for Stems<'a> { type Item = StemmedWord; fn next(&mut self) -> Option { - let mut word_to_stem = String::new(); - let mut normalized = String::new(); + // first we loop through until we find alphabetic chars. That becomes our stem word.
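+ // (editor's illustration of the fix, not part of the original patch: "™".nfkc()
+ // collects to "TM", whose first char is alphabetic, so a trailing symbol that
+ // normalizes to letters now takes the stemming path instead of tripping the
+ // old pre-normalization check)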
+ let mut word_to_stem = String::new(); // will contain any leading non-alphabetic chars + // iff no other alphabetic chars + + let mut normalized = String::new(); // will contain the first alphabetic chars loop { match self.words.peek() { Some(&(_pos, word)) => { normalized = word.nfkc().collect::(); - if word.chars().next().unwrap().is_alphabetic() { + if normalized.chars().next().unwrap().is_alphabetic() { break; } else { word_to_stem.push_str(&normalized); @@ -58,7 +61,7 @@ impl<'a> Iterator for Stems<'a> { // in this case we were passed an empty string // so we don't just return None, but we return // an empty string Stemmed word. - // otherwise searching fields with empty strings + // otherwise searching fields for empty strings // wouldn't be possible. return Some(StemmedWord { word_pos: 0, @@ -75,9 +78,10 @@ impl<'a> Iterator for Stems<'a> { } if !word_to_stem.is_empty() { - // we found the begining of the string is not a stemmable word. + // we found the string is not a stemmable word. // Return the accumulated string as the stemmed word debug_assert!(self.word_position == 0); + self.word_position += 1; return Some(StemmedWord { word_pos: 0, @@ -88,14 +92,14 @@ impl<'a> Iterator for Stems<'a> { self.words.next(); word_to_stem = normalized; loop { - // loop through all non-alphabetic chars and add to suffix - match self.words.peek() { + // loop through all non-alphabetic chars discarding them. + match self.words.peek() { // peek to avoid advancing iter Some(&(_pos, word)) => { normalized = word.nfkc().collect::(); if normalized.chars().next().unwrap().is_alphabetic() { - break; + break; // now we'll get these on the next() call } else { - self.words.next(); + self.words.next(); //advance the iter } }, None => break, @@ -172,6 +176,20 @@ mod tests { } } + #[test] + fn test_stems_trailing_needs_normalized() { + let input = r#"Didgeridoos™"#; + let result = Stems::new(input).collect::>(); + let expected = vec![ + StemmedWord { word_pos: 0, stemmed: String::from("didgeridoo")}, + StemmedWord { word_pos: 1, stemmed: String::from("tm")}, + ]; + assert_eq!(result.len(), expected.len()); + for (stem, expected_stem) in result.iter().zip(expected.iter()) { + assert_eq!(stem, expected_stem); + } + } + #[test] fn test_stems_unicode_lowercase_has_more_bytes() { let input = "İ"; From eec3618ba9e635448d6c42a1717bca45ad876ff8 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 18 Jan 2017 12:28:27 -0800 Subject: [PATCH 066/122] Change stemmer to return non-alpha chars if whole string is non-alpha This will make the index smaller but will still offer efficiencies since at least something can be indexed for any string. --- src/stems.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/stems.rs b/src/stems.rs index ecc33b2..0974a05 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -48,6 +48,7 @@ impl<'a> Iterator for Stems<'a> { Some(&(_pos, word)) => { normalized = word.nfkc().collect::(); if normalized.chars().next().unwrap().is_alphabetic() { + word_to_stem.clear(); break; } else { word_to_stem.push_str(&normalized); @@ -152,9 +153,8 @@ mod tests { let input = "@!? Let's seeing..."; let result = Stems::new(input).collect::>(); let expected = vec![ - StemmedWord { word_pos: 0, stemmed: String::from("@!? 
")}, - StemmedWord { word_pos: 1, stemmed: String::from("let")}, - StemmedWord { word_pos: 2, stemmed: String::from("see")}, + StemmedWord { word_pos: 0, stemmed: String::from("let")}, + StemmedWord { word_pos: 1, stemmed: String::from("see")}, ]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { From 5b64302b6e292eed733a3bff9235e26d849bb413 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 18 Jan 2017 14:31:01 -0800 Subject: [PATCH 067/122] flush uncommitted data on close from repl Since the user likely to forget! --- src/index.rs | 10 +++++++--- src/repl.rs | 6 ++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/index.rs b/src/index.rs index 6ef57e4..4003456 100644 --- a/src/index.rs +++ b/src/index.rs @@ -87,6 +87,10 @@ impl Index { Ok(()) } + pub fn is_open(&self) -> bool { + self.rocks.is_some() + } + //This deletes the Rockdbs instance from disk pub fn drop(name: &str) -> Result<(), Error> { let ret = try!(rocksdb::DB::destroy(&rocksdb::Options::default(), name)); @@ -94,7 +98,7 @@ impl Index { } pub fn add(&mut self, json: &str) -> Result { - if self.rocks.is_none() { + if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); } let mut shredder = Shredder::new(); @@ -129,7 +133,7 @@ impl Index { /// Returns Ok(true) if the document was found and deleted, Ok(false) if it could not be found pub fn delete(&mut self, docid: &str) -> Result { - if self.rocks.is_none() { + if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); } if self.id_str_in_batch.contains(docid) { @@ -181,7 +185,7 @@ impl Index { // Store the current batch pub fn flush(&mut self) -> Result<(), Error> { // Flush can only be called if the index is open - if self.rocks.is_none() { + if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); } let rocks = self.rocks.as_ref().unwrap(); diff --git a/src/repl.rs b/src/repl.rs index 12fd148..7885c82 100644 --- a/src/repl.rs +++ b/src/repl.rs @@ -48,6 +48,12 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { } } } else { + // commit anything written + if index.is_open() { + if let Err(reason) = index.flush() { + write!(w, "{}\n", reason).unwrap(); + } + } return; } if test_mode { From f7f90884fe3163250074385fa57073463e3109ac Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 26 Jan 2017 09:08:55 -0800 Subject: [PATCH 068/122] Simplified stemming code Complex code remained where changes made where we now store original text in original document. 
--- src/stems.rs | 115 ++++++++++++++++++--------------------------------- 1 file changed, 40 insertions(+), 75 deletions(-) diff --git a/src/stems.rs b/src/stems.rs index 0974a05..9adc10f 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -2,15 +2,13 @@ extern crate stemmer; extern crate unicode_normalization; extern crate unicode_segmentation; -use std::iter::Peekable; - use self::stemmer::Stemmer; use self::unicode_normalization::UnicodeNormalization; use self::unicode_segmentation::UnicodeSegmentation; pub struct Stems<'a> { - words: Peekable>, + words: unicode_segmentation::UWordBoundIndices<'a>, stemmer: Stemmer, word_position: usize, } @@ -27,7 +25,7 @@ pub struct StemmedWord { impl<'a> Stems<'a> { pub fn new(text: &str) -> Stems { Stems{ - words: text.split_word_bound_indices().peekable(), + words: text.split_word_bound_indices(), stemmer: Stemmer::new("english").unwrap(), word_position: 0, } } @@ -38,81 +36,48 @@ impl<'a> Iterator for Stems<'a> { type Item = StemmedWord; fn next(&mut self) -> Option { - // first we loop through until we find alphabetic chars. That becomes our stem word. - let mut word_to_stem = String::new(); // will contain any leading non-alphabetic chars - // iff no other alphabetic chars - - let mut normalized = String::new(); // will contain the first alphabetic chars - loop { - match self.words.peek() { - Some(&(_pos, word)) => { - normalized = word.nfkc().collect::(); - if normalized.chars().next().unwrap().is_alphabetic() { - word_to_stem.clear(); - break; - } else { - word_to_stem.push_str(&normalized); - self.words.next(); - } - }, - None => { - if word_to_stem.is_empty() { - if self.word_position == 0 { - self.word_position = 1; - // in this case we were passed an empty string - // so we don't just return None, but we return - // an empty string Stemmed word. - // otherwise searching fields for empty strings - // wouldn't be possible. - return Some(StemmedWord { - word_pos: 0, - stemmed: String::new(), - }); - } else { - return None; - } - } else { - break; - } - }, + // we loop through until we find alphabetic chars. That becomes our stem word. + let mut non_alpha = String::new(); // will contain any non-alphabetic chars + // returned iff no other alphabetic chars + while let Some((_pos, word)) = self.words.next() { + let normalized = word.nfkc().collect::(); + if normalized.chars().next().unwrap().is_alphabetic() { + let pos = self.word_position; + self.word_position += 1; + return Some(StemmedWord { + word_pos: pos as u32, + stemmed: self.stemmer.stem(&normalized.to_lowercase()), + }); + } else { + non_alpha.push_str(&normalized); } - } - - if !word_to_stem.is_empty() { - // we found the string is not a stemmable word. - // Return the accumulated string as the stemmed word - debug_assert!(self.word_position == 0); - - self.word_position += 1; - return Some(StemmedWord { - word_pos: 0, - stemmed: word_to_stem, + } + if non_alpha.is_empty() { + if self.word_position == 0 { + self.word_position = 1; + // in this case we were passed an empty string + // so we don't just return None, but we return + // an empty string Stemmed word. + // otherwise searching fields for empty strings + // wouldn't be possible. + return Some(StemmedWord { + word_pos: 0, + stemmed: String::new(), }); - } - // normalized contains our stemmable word. advance the iter since we only peeked. - self.words.next(); - word_to_stem = normalized; - loop { - // loop through all non-alphabetic chars discarding them.
- match self.words.peek() { // peek to avoid advancing iter - Some(&(_pos, word)) => { - normalized = word.nfkc().collect::(); - if normalized.chars().next().unwrap().is_alphabetic() { - break; // now we'll get these on the next() call - } else { - self.words.next(); //advance the iter - } - }, - None => break, + } else { + return None; + } + } else { + if self.word_position == 0 { + self.word_position = 1; + return Some(StemmedWord { + word_pos: 0, + stemmed: non_alpha, + }); + } else { + return None; } } - let stemmed = self.stemmer.stem(&word_to_stem.to_lowercase()); - let ret = StemmedWord { - word_pos: self.word_position as u32, - stemmed: stemmed, - }; - self.word_position += 1; - Some(ret) } } From a4904d3d3aa1f8c3ef00e5b05e53edae687d2923 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 26 Jan 2017 09:24:24 -0800 Subject: [PATCH 069/122] Small optimization and formatting changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don’t copy non-alpha chars unless leading chars. --- src/stems.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/stems.rs b/src/stems.rs index 9adc10f..6c991f0 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -45,11 +45,13 @@ impl<'a> Iterator for Stems<'a> { let pos = self.word_position; self.word_position += 1; return Some(StemmedWord { - word_pos: pos as u32, - stemmed: self.stemmer.stem(&normalized.to_lowercase()), - }); + word_pos: pos as u32, + stemmed: self.stemmer.stem(&normalized.to_lowercase()), + }); } else { - non_alpha.push_str(&normalized); + if self.word_position == 0 { + non_alpha.push_str(&normalized); + } } } if non_alpha.is_empty() { @@ -71,9 +73,9 @@ impl<'a> Iterator for Stems<'a> { if self.word_position == 0 { self.word_position = 1; return Some(StemmedWord { - word_pos: 0, - stemmed: non_alpha, - }); + word_pos: 0, + stemmed: non_alpha, + }); } else { return None; } From b4328e2b1d7f8dc8f1c112392114e186fb60ba08 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sat, 4 Feb 2017 13:32:38 -0800 Subject: [PATCH 070/122] Added ability to return object values nested in arrays as arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `find … return .foo[*].bar` This will return every instance of the bar field in the object nested inside the array. See the examples at the end of the query_basic.noise --- repl-tests/not.noise | 2 +- repl-tests/query_basic.noise | 22 ++++ src/json_shred.rs | 44 +++++++- src/key_builder.rs | 11 -- src/parser.rs | 67 ++++++------ src/query.rs | 11 +- src/returnable.rs | 196 +++++++++++++++++++++++++++-------- 7 files changed, 261 insertions(+), 92 deletions(-) diff --git a/repl-tests/not.noise b/repl-tests/not.noise index 9a1d9f3..ff439e7 100644 --- a/repl-tests/not.noise +++ b/repl-tests/not.noise @@ -85,7 +85,7 @@ return ._id ; find !{baz: [~="fox"]} return ._id ; -Parse error: query cannot be made up of only logical not. Must have at least match clause not negated. +Parse error: query cannot be made up of only logical not. Must have at least one match clause not negated. 
find !{baz: ~="fox"} && !{baz: =="foo"} return ._id ; diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise index bd327c3..cd718f0 100644 --- a/repl-tests/query_basic.noise +++ b/repl-tests/query_basic.noise @@ -233,3 +233,25 @@ find {_id: =="14"} return .; {"A":{"B":true},"_id":"14"} ] +# return everying in deeply nested arrays + +add {"_id":"15", "a":[{"b":[{"c":1},{"c":2},{"c":3}]},{"b":[{"c":4},{"c":5},{"c":6}]}]}; +"15" + +find {"_id": =="15"} +return .a[*].b[*].c; +[ +[[1,2,3],[4,5,6]] +] + +# check what happens when only some key paths exist + +add {"_id":"16", "type": "nested", "a":[{"b":[{"b":1},{"c":2},{"b":3}]},{"b":[{"c":4},{"c":5},{"c":6}]}]}; +"16" +find {"_id": =="16"} +return .a[*].b[*].c; +[ +[[2],[4,5,6]] +] + + diff --git a/src/json_shred.rs b/src/json_shred.rs index cbea30a..879464a 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -184,6 +184,7 @@ impl Shredder { // or the previous object key self.kb.pop_object_key(); self.kb.push_object_key(key); + *self.object_keys_indexed.last_mut().unwrap() = true; } } Ok(()) @@ -353,6 +354,8 @@ mod tests { use std::str; use index::{Index, OpenOptions}; + use returnable::RetValue; + use json_value::JsonValue; fn positions_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, Vec)> { let mut result = Vec::new(); @@ -372,6 +375,17 @@ mod tests { result } + fn values_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, JsonValue)> { + let mut result = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'V' { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + result.push((key_string, RetValue::bytes_to_json_value(&*value))); + } + } + result + } + #[test] fn test_shred_nested() { @@ -383,7 +397,7 @@ mod tests { shredder.add_id("foo").unwrap(); shredder.add_all_to_batch(docseq, &mut batch).unwrap(); - let dbname = "target/tests/test_shred_netsted"; + let dbname = "target/tests/test_shred_nested"; let _ = Index::drop(dbname); let mut index = Index::new(); @@ -403,6 +417,34 @@ mod tests { assert_eq!(result, expected); } + #[test] + fn test_shred_double_nested() { + let mut shredder = super::Shredder::new(); + let json = r#"{"a":{"a":"b"}}"#; + let docseq = 123; + let mut batch = rocksdb::WriteBatch::default(); + shredder.shred(json).unwrap(); + shredder.add_id("foo").unwrap(); + shredder.add_all_to_batch(docseq, &mut batch).unwrap(); + + let dbname = "target/tests/test_shred_double_nested"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let rocks = &index.rocks.unwrap(); + + rocks.write(batch).unwrap(); + let result = values_from_rocks(&rocks); + + let expected = vec![ + ("V123#._id".to_string(), JsonValue::String("foo".to_string())), + ("V123#.a.a".to_string(), JsonValue::String("b".to_string())) + ]; + assert_eq!(result, expected); + } + + #[test] // NOTE vmx 2016-12-06: This test is intentionally made to fail (hence ignored) as the current // current tokenizer does the wrong thing when it comes to numbers within words. 
It's left diff --git a/src/key_builder.rs b/src/key_builder.rs index 7ea41a1..8f0d34a 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -283,16 +283,6 @@ impl KeyBuilder { self.keypath.pop(); } - pub fn peek_object_key(&self) -> String { - debug_assert!(self.keypath.last().unwrap().starts_with(".")); - let x = KeyBuilder::parse_first_key_value_segment(&self.keypath.last().unwrap()); - if let Some((Segment::ObjectKey(key), _unescaped)) = x { - key - } else { - panic!("peek_object_key is messed up yo!"); - } - } - pub fn peek_array_offset(&self) -> u64 { debug_assert!(self.keypath.last().unwrap().starts_with("$")); self.arraypath.last().unwrap().clone() @@ -398,7 +388,6 @@ impl KeyBuilder { }, } } - } diff --git a/src/parser.rs b/src/parser.rs index 437a19d..0cbda8a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -10,7 +10,8 @@ use key_builder::KeyBuilder; use stems::Stems; use json_value::JsonValue; use query::{Sort, AggregateFun, SortInfo, SortField}; -use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore}; +use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore, + ReturnPath}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter, NotFilter}; @@ -145,7 +146,7 @@ impl<'a, 'c> Parser<'a, 'c> { fn consume_aggregate(&mut self) -> Result, - KeyBuilder, + ReturnPath, JsonValue)>, Error> { let offset = self.offset; let mut aggregate_fun = if self.consume("group") { @@ -171,11 +172,11 @@ impl<'a, 'c> Parser<'a, 'c> { if self.consume("(") { if aggregate_fun == AggregateFun::Count { try!(self.must_consume(")")); - Ok(Some((aggregate_fun, None, KeyBuilder::new(), JsonValue::Null))) + Ok(Some((aggregate_fun, None, ReturnPath::new(), JsonValue::Null))) } else if aggregate_fun == AggregateFun::Concat { let bind_name_option = self.consume_field(); - if let Some(kb) = try!(self.consume_keypath()) { + if let Some(rp) = try!(self.consume_keypath()) { let json = if self.consume("sep") { try!(self.must_consume("=")); JsonValue::String(try!(self.must_consume_string_literal())) @@ -183,14 +184,14 @@ impl<'a, 'c> Parser<'a, 'c> { JsonValue::String(",".to_string()) }; try!(self.must_consume(")")); - Ok(Some((aggregate_fun, bind_name_option, kb, json))) + Ok(Some((aggregate_fun, bind_name_option, rp, json))) } else { Err(Error::Parse("Expected keypath or bind variable".to_string())) } } else { let bind_name_option = self.consume_field(); - if let Some(kb) = try!(self.consume_keypath()) { + if let Some(rp) = try!(self.consume_keypath()) { if self.consume("order") { try!(self.must_consume("=")); if self.consume("asc") { @@ -203,7 +204,7 @@ impl<'a, 'c> Parser<'a, 'c> { } try!(self.must_consume(")")); - Ok(Some((aggregate_fun, bind_name_option, kb, JsonValue::Null))) + Ok(Some((aggregate_fun, bind_name_option, rp, JsonValue::Null))) } else { Err(Error::Parse("Expected keypath or bind variable".to_string())) } @@ -215,7 +216,7 @@ impl<'a, 'c> Parser<'a, 'c> { } } - fn consume_keypath(&mut self) -> Result, Error> { + fn consume_keypath(&mut self) -> Result, Error> { let key: String = if self.consume(".") { if self.consume("[") { let key = try!(self.must_consume_string_literal()); @@ -227,26 +228,30 @@ impl<'a, 'c> Parser<'a, 'c> { } else { self.ws(); // this means return the whole document - return Ok(Some(KeyBuilder::new())); + return Ok(Some(ReturnPath::new())); } } } else { return Ok(None); }; - let mut kb = 
KeyBuilder::new(); - kb.push_object_key(&key); + let mut ret_path = ReturnPath::new(); + ret_path.push_object_key(key); loop { if self.consume("[") { if let Some(index) = try!(self.consume_integer()) { - kb.push_array_index(index as u64); + ret_path.push_array(index as u64); } else { - return Err(Error::Parse("Expected array index integer.".to_string())); + if self.consume("*") { + ret_path.push_array_all(); + } else { + return Err(Error::Parse("Expected array index integer or *.".to_string())); + } } try!(self.must_consume("]")); } else if self.consume(".") { if let Some(key) = self.consume_field() { - kb.push_object_key(&key); + ret_path.push_object_key(key); } else { return Err(Error::Parse("Expected object key.".to_string())); } @@ -255,7 +260,7 @@ } } self.ws(); - Ok(Some(kb)) + Ok(Some(ret_path)) } // if no boost is specified returns 1.0 @@ -776,7 +781,7 @@ impl<'a, 'c> Parser<'a, 'c> { if self.consume("sort") { let mut n = 0; loop { - if let Some(kb) = try!(self.consume_keypath()) { + if let Some(rp) = try!(self.consume_keypath()) { // doing the search for source 2x so user can order // anyway they like. Yes it's a hack, but it's simple. let mut sort = if self.consume("asc") { @@ -806,9 +811,8 @@ impl<'a, 'c> Parser<'a, 'c> { sort }; - sort_infos.insert(kb.value_key(0), SortInfo{field: SortField::FetchValue(kb), - sort: sort, order_to_apply: n, - default: default}); + sort_infos.insert(rp.to_key(), SortInfo{field: SortField::FetchValue(rp), + sort: sort, order_to_apply: n, default: default}); } else { try!(self.must_consume("score")); try!(self.must_consume("(")); @@ -849,9 +853,9 @@ impl<'a, 'c> Parser<'a, 'c> { Err(Error::Parse("Expected key, object or array to return.".to_string())) } } else { - let mut kb = KeyBuilder::new(); - kb.push_object_key("_id"); - Ok(Box::new(RetValue{kb: kb, ag:None, default: JsonValue::Null, sort_info: None})) + let mut rp = ReturnPath::new(); + rp.push_object_key("_id".to_string()); + Ok(Box::new(RetValue{rp: rp, ag:None, default: JsonValue::Null, sort_info: None})) } } @@ -916,25 +920,24 @@ impl<'a, 'c> Parser<'a, 'c> { } } - if let Some((ag, bind_name_option, kb, json)) = try!(self.consume_aggregate()) { + if let Some((ag, bind_name_option, rp, json)) = try!(self.consume_aggregate()) { let default = if let Some(default) = try!(self.consume_default()) { default } else { JsonValue::Null }; if let Some(bind_name) = bind_name_option { - let extra_key = kb.value_key_path_only(); - Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_key: extra_key, + Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_rp: rp, ag: Some((ag, json)), default: default, sort_info:None}))) } else { - Ok(Some(Box::new(RetValue{kb: kb, ag: Some((ag, json)), + Ok(Some(Box::new(RetValue{rp: rp, ag: Some((ag, json)), default: default, sort_info:None}))) } } else if let Some(bind_name) = self.consume_field() { - let extra_key = if let Some(kb) = try!(self.consume_keypath()) { - kb.value_key_path_only() + let rp = if let Some(rp) = try!(self.consume_keypath()) { + rp } else { - "".to_string() + ReturnPath::new() }; let default = if let Some(default) = try!(self.consume_default()) { @@ -943,16 +946,16 @@ impl<'a, 'c> Parser<'a, 'c> { JsonValue::Null }; - Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_key: extra_key, + Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_rp: rp, ag: None, default: default, sort_info:None}))) - } else if let Some(kb) = try!(self.consume_keypath()) { + } else if let Some(rp) = try!(self.consume_keypath()) { let default
= if let Some(default) = try!(self.consume_default()) { default } else { JsonValue::Null }; - Ok(Some(Box::new(RetValue{kb: kb, ag: None, default: default, sort_info: None}))) + Ok(Some(Box::new(RetValue{rp: rp, ag: None, default: default, sort_info: None}))) } else if self.could_consume("{") { Ok(Some(try!(self.ret_object()))) } else if self.could_consume("[") { diff --git a/src/query.rs b/src/query.rs index f8f6c99..cea4f31 100644 --- a/src/query.rs +++ b/src/query.rs @@ -9,11 +9,10 @@ use std::usize; use error::Error; use index::Index; -use key_builder::KeyBuilder; use parser::Parser; use json_value::{JsonValue}; use filters::QueryRuntimeFilter; -use returnable::{Returnable, RetValue, RetScore, RetHidden}; +use returnable::{Returnable, RetValue, RetScore, RetHidden, ReturnPath}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs use rocksdb::{DBIterator, IteratorMode, Snapshot}; @@ -175,7 +174,7 @@ impl Query { if filter.is_all_not() { return Err(Error::Parse("query cannot be made up of only logical not. Must have at least \ - match clause not negated.".to_string())); + one match clause not negated.".to_string())); } let mut ags = Vec::new(); @@ -200,8 +199,8 @@ impl Query { for (_key, sort_info) in sorts.into_iter() { let sort = sort_info.clone(); match sort_info.field { - SortField::FetchValue(kb) => { - vec.push(Box::new(RetValue{ kb: kb, + SortField::FetchValue(rp) => { + vec.push(Box::new(RetValue{ rp: rp, ag: None, default: sort_info.default, sort_info: Some(sort)})); @@ -838,7 +837,7 @@ pub enum Sort { #[derive(Clone)] pub enum SortField { - FetchValue(KeyBuilder), + FetchValue(ReturnPath), Score, } diff --git a/src/returnable.rs b/src/returnable.rs index 6873c4f..f87965a 100644 --- a/src/returnable.rs +++ b/src/returnable.rs @@ -13,6 +13,71 @@ use query::{AggregateFun, SortInfo}; use rocksdb::{self, DBIterator, IteratorMode}; +#[derive(Clone)] +pub enum PathSegment { + ObjectKey(String), + Array(u64), + ArrayAll, +} + +#[derive(Clone)] +pub struct ReturnPath { + path: Vec, +} + +impl ReturnPath { + pub fn new() -> ReturnPath { + ReturnPath{path: Vec::new()} + } + + pub fn push_object_key(&mut self, key: String) { + self.path.push(PathSegment::ObjectKey(key)); + } + + pub fn push_array(&mut self, index: u64) { + self.path.push(PathSegment::Array(index)); + } + + pub fn push_array_all(&mut self) { + self.path.push(PathSegment::ArrayAll); + } + + pub fn to_key(&self) -> String { + let mut key = String::new(); + for seg in self.path.iter() { + match seg { + &PathSegment::ObjectKey(ref str) => { + key.push('.'); + for cc in str.chars() { + // Escape chars that conflict with delimiters + if "\\$.".contains(cc) { + key.push('\\'); + } + key.push(cc); + } + }, + &PathSegment::Array(ref i) => { + key.push('$'); + key.push_str(&i.to_string()); + }, + &PathSegment::ArrayAll => { + key.push_str("$*"); + }, + } + } + key + } + + fn nth(&self, i: usize) -> Option<&PathSegment> { + if self.path.len() <= i { + None + } else { + Some(&self.path[i]) + } + } +} + + /// Returnables are created from parsing the return statement in queries. /// They nest inside of each other, with the outermost typically being a RetObject or RetArray. @@ -207,7 +272,7 @@ impl Returnable for RetLiteral { /// A value from a document. It knows the path it wants to fetch and loads the value from the /// stored original document. 
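/// (editor's illustration, not in the original source, derived from to_key() above:
/// `return .foo[*].bar` parses to the segments [ObjectKey("foo"), ArrayAll, ObjectKey("bar")],
/// which to_key() renders as ".foo$*.bar".)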
pub struct RetValue { - pub kb: KeyBuilder, + pub rp: ReturnPath, pub ag: Option<(AggregateFun, JsonValue)>, pub default: JsonValue, pub sort_info: Option, @@ -245,6 +310,79 @@ impl RetValue { .collect())) } + fn descend_return_path(iter: &mut DBIterator, seq: u64, kb: &mut KeyBuilder, + rp: &ReturnPath, mut rp_index: usize) -> Result, Error> { + + while let Some(segment) = rp.nth(rp_index) { + rp_index += 1; + match segment { + &PathSegment::ObjectKey(ref string) => { + kb.push_object_key(string); + }, + &PathSegment::ArrayAll => { + let mut i = 0; + let mut vec = Vec::new(); + loop { + kb.push_array_index(i); + i += 1; + if let Some(json) = try!(RetValue::descend_return_path(iter, seq, + &mut kb.clone(), rp, rp_index)) { + vec.push(json); + kb.pop_array(); + } else { + // we didn't get a value, is it because the array ends or the + // full path isn't there? check as there might be more array elements + // with a full path that does match. + let value_key = kb.value_key(seq); + kb.pop_array(); + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + if let Some((key, _value)) = iter.next() { + if key.starts_with(value_key.as_bytes()) { + // yes it exists. loop again. + continue; + } + } + + if vec.is_empty() { + return Ok(None); + } else { + return Ok(Some(JsonValue::Array(vec))); + } + } + } + }, + &PathSegment::Array(ref index) => { + kb.push_array_index(*index); + } + } + } + + let value_key = kb.value_key(seq); + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => { + return Ok(None) + }, + }; + + if !key.starts_with(value_key.as_bytes()) { + return Ok(None) + } + + let json_value = try!(RetValue::fetch(&mut iter.peekable(), &value_key, + key, value)); + Ok(Some(json_value)) + } + fn fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, mut key: Box<[u8]>, mut value: Box<[u8]>) -> Result { @@ -358,28 +496,13 @@ impl Returnable for RetValue { return Ok(()); } - let value_key = self.kb.value_key(seq); + let mut kb = KeyBuilder::new(); - // Seek in index to >= entry - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - - let (key, value) = match iter.next() { - Some((key, value)) => (key, value), - None => { - result.push_back(self.default.clone()); - return Ok(()) - }, - }; - - if !key.starts_with(value_key.as_bytes()) { + if let Some(json) = try!(RetValue::descend_return_path(iter, seq, &mut kb, &self.rp, 0)) { + result.push_back(json); + } else { result.push_back(self.default.clone()); - return Ok(()); } - - let json_value = try!(RetValue::fetch(&mut iter.peekable(), &value_key, - key, value)); - result.push_back(json_value); Ok(()) } @@ -388,7 +511,7 @@ impl Returnable for RetValue { } fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.sort_info = map.remove(&self.kb.value_key(0)); + self.sort_info = map.remove(&self.rp.to_key()); } fn get_sorting(&mut self, sorts: &mut Vec>) { @@ -409,38 +532,29 @@ impl Returnable for RetValue { /// original document and return it. 
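/// (editor's note, inferred from the parser code above: in a return clause like `x.bar`,
/// `x` becomes bind_name and `.bar` parses into extra_rp; the value keys bound to `x`
/// during filtering are then looked up in bind_var_keys.)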
pub struct RetBind { pub bind_name: String, - pub extra_key: String, + pub extra_rp: ReturnPath, pub ag: Option<(AggregateFun, JsonValue)>, pub default: JsonValue, pub sort_info: Option, } impl Returnable for RetBind { - fn fetch_result(&self, iter: &mut DBIterator, _seq: u64, _score: f32, + fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) -> Result<(), Error> { if let Some(value_keys) = bind_var_keys.get(&self.bind_name) { let mut array = Vec::with_capacity(value_keys.len()); for base_key in value_keys { - // Seek in index to >= entry - let value_key = base_key.to_string() + &self.extra_key; - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - - let (key, value) = match iter.next() { - Some((key, value)) => (key, value), - None => { - result.push_back(self.default.clone()); - return Ok(()) - }, - }; - - if !key.starts_with(value_key.as_bytes()) { - array.push(self.default.clone()); + let mut kb = KeyBuilder::new(); + + kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&base_key)); + + if let Some(json) = try!(RetValue::descend_return_path(iter, seq, &mut kb, + &self.extra_rp, 0)) { + array.push(json); } else { - array.push(try!(RetValue::fetch(&mut iter.peekable(), &value_key, - key, value))); + array.push(self.default.clone()); } } result.push_back(JsonValue::Array(array)); @@ -456,7 +570,7 @@ impl Returnable for RetBind { } fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.sort_info = map.remove(&(self.bind_name.to_string() + &self.extra_key)); + self.sort_info = map.remove(&(self.bind_name.to_string() + &self.extra_rp.to_key())); } fn get_sorting(&mut self, sorts: &mut Vec>) { From b22c31b911f054ba7d1526b18a1c3fd2280730db Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Mon, 13 Feb 2017 14:20:46 -0800 Subject: [PATCH 071/122] Aggregates now work on arrays Some aggregate functions now work on returned arrays instead of treating the array as a single value. max and min will continue to work as normal, but max_array and min_array will traverse arrays and nested arrays and use the values found inside. If they return [], it means no values were found. array_flat will extract items out of arrays, with the result being a single flat array of values. avg and sum will always search through arrays and nested arrays looking for numbers, instead of ignoring the array.
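For example (editor's illustration mirroring the repl tests below): given two docs with baz: ["a","b",["c","d",["e"]]] and baz: ["f","g",["h","i"],"j"], max(.baz) still compares the values whole and returns ["f","g",["h","i"],"j"], while max_array(.baz) descends into the nested arrays and returns "j", min_array(.baz) returns "a", and array_flat(.baz) returns ["f","g","h","i","j","a","b","c","d","e"].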
--- repl-tests/group.noise | 93 ++++++++++++++++++++++- src/index.rs | 72 ++++++++++++++++++ src/json_shred.rs | 18 ++--- src/parser.rs | 10 ++- src/query.rs | 168 +++++++++++++++++++++++++++++++++-------- src/repl.rs | 13 +++- 6 files changed, 325 insertions(+), 49 deletions(-) diff --git a/repl-tests/group.noise b/repl-tests/group.noise index dbfc952..57a4a05 100644 --- a/repl-tests/group.noise +++ b/repl-tests/group.noise @@ -57,11 +57,11 @@ return {baz: group(.baz), concat: concat(.baz sep="|")}; ] find {foo: =="group"} -return {baz: group(.baz), list: list(.baz)}; +return {baz: group(.baz), array: array(.baz)}; [ -{"baz":"a","list":["a","a","a","a"]}, -{"baz":"b","list":["b","b","b","b"]}, -{"baz":"c","list":["c","c","c","c"]} +{"baz":"a","array":["a","a","a","a"]}, +{"baz":"b","array":["b","b","b","b"]}, +{"baz":"c","array":["c","c","c","c"]} ] find {foo: =="group"} @@ -161,3 +161,88 @@ return [group(.baz order=asc) default="a", group(.bar order=desc) default="c", c ["b","b",1], ["b","a",1] ] + +add {"_id":"1", "foo":"array", "baz": ["a","b",["c","d",["e"]]]}; +"1" +add {"_id":"2", "foo":"array", "baz": ["f","g",["h","i"],"j"]}; +"2" + +find {foo: =="array"} +return array(.baz); +[ +[["f","g",["h","i"],"j"],["a","b",["c","d",["e"]]]] +] + +find {foo: =="array"} +return array_flat(.baz); +[ +["f","g","h","i","j","a","b","c","d","e"] +] + +find {foo: =="array"} +return max(.baz); +[ +["f","g",["h","i"],"j"] +] + +find {foo: =="array"} +return max_array(.baz); +[ +"j" +] + +find {foo: =="array"} +return min_array(.baz); +[ +"a" +] + +add {"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}; +"1" +add {"_id":"2", "foo":"array", "baz": [6,7,[8,9],10]}; +"2" + + +find {foo: =="array"} +return avg(.baz); +[ +5.5 +] + +find {foo: =="array"} +return sum(.baz); +[ +55 +] + +add {"_id":"1", "foo":"array", "baz": []}; +"1" +add {"_id":"2", "foo":"array", "baz": []}; +"2" + +commit; + +find {foo: =="array"} +return avg(.baz); +[ +null +] + +find {foo: =="array"} +return sum(.baz); +[ +0 +] + +find {foo: =="array"} +return min_array(.baz); +[ +[] +] + +find {foo: =="array"} +return max_array(.baz); +[ +[] +] + diff --git a/src/index.rs b/src/index.rs index 4003456..bb0ae9b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -203,6 +203,19 @@ impl Index { Ok(status) } + pub fn all_keys(&self) -> Result, Error> { + if !self.is_open() { + return Err(Error::Write("Index isn't open.".to_string())); + } + let rocks = self.rocks.as_ref().unwrap(); + let mut results = Vec::new(); + for (key, _value) in rocks.iterator(rocksdb::IteratorMode::Start) { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + results.push(key_string); + } + Ok(results) + } + /// Should not be used generally since it is not varint. Used for header fields /// since only one header is in the database it's not a problem with excess size.
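/// (editor's note, an inference from the transmute in the earlier hunk: these u64
/// header helpers use a plain fixed-width 8-byte, native-endian encoding, unlike
/// the varint i32 helpers used for counts.)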
fn convert_bytes_to_u64(bytes: &[u8]) -> u64 { @@ -296,6 +309,8 @@ mod tests { use super::{Index, OpenOptions}; use query::Query; use std::str; + use returnable::RetValue; + use json_value::JsonValue; #[test] fn test_open() { @@ -354,4 +369,61 @@ mod tests { assert!(key.starts_with(&b"HDB"[..])); assert!(iter.next().is_none()); } + + #[test] + fn test_updates() { + let dbname = "target/tests/testupdates"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + //let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": ["a","b",["c","d",["e"]]]}"#).unwrap(); + + //index.flush().unwrap(); + + let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}"#).unwrap(); + + index.flush().unwrap(); + { + let rocks = index.rocks.as_mut().unwrap(); + + let mut results = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'V' { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + results.push((key_string, RetValue::bytes_to_json_value(&*value))); + } + } + + let expected = vec![ + ("V1#._id".to_string(), JsonValue::String("1".to_string())), + ("V1#.baz$0".to_string(), JsonValue::Number(1.0)), + ("V1#.baz$1".to_string(), JsonValue::Number(2.0)), + ("V1#.baz$2$0".to_string(), JsonValue::Number(3.0)), + ("V1#.baz$2$1".to_string(), JsonValue::Number(4.0)), + ("V1#.baz$2$2$0".to_string(), JsonValue::Number(5.0)), + ("V1#.foo".to_string(), JsonValue::String("array".to_string()))]; + assert_eq!(results, expected); + } + + let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": []}"#).unwrap(); + index.flush().unwrap(); + + let rocks = index.rocks.as_mut().unwrap(); + + let mut results = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'V' { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + results.push((key_string, RetValue::bytes_to_json_value(&*value))); + } + } + let expected = vec![ + ("V1#._id".to_string(), JsonValue::String("1".to_string())), + ("V1#.baz".to_string(), JsonValue::Array(vec![])), + ("V1#.foo".to_string(), JsonValue::String("array".to_string())) + ]; + assert_eq!(results, expected); + } } diff --git a/src/json_shred.rs b/src/json_shred.rs index 879464a..ce47b3d 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -109,13 +109,6 @@ impl Shredder { try!(batch.put(&key.into_bytes(), &buffer.as_bytes())); } - let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); - if delete { - try!(batch.delete(&key.into_bytes())); - } else { - try!(batch.put(&key.into_bytes(), &docseq.to_string().as_bytes())); - } - Ok(()) } @@ -217,6 +210,9 @@ impl Shredder { try!(batch.put(&key.into_bytes(), &value.as_ref())); } } + let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); + try!(batch.put(&key.into_bytes(), &seq.to_string().as_bytes())); + Ok(()) } @@ -236,6 +232,8 @@ impl Shredder { try!(batch.delete(&key.into_bytes())); } } + let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); + try!(batch.delete(&key.into_bytes())); Ok(()) } @@ -467,7 +465,6 @@ mod tests { rocks.write(batch).unwrap(); let result = positions_from_rocks(&rocks); - println!("result: {:?}", result); let expected = vec![ ("W.A$.B!b1#1234,1".to_string(), vec![0]), ("W.A$.B!b2vmx#1234,0".to_string(), vec![0]), @@ -488,6 +485,7 @@ mod tests { let docseq = 123; let mut batch = rocksdb::WriteBatch::default(); shredder.shred(json).unwrap(); + shredder.add_id("foo").unwrap(); 
shredder.add_all_to_batch(docseq, &mut batch).unwrap(); let dbname = "target/tests/test_shred_empty_object"; @@ -500,7 +498,7 @@ mod tests { rocks.write(batch).unwrap(); let result = positions_from_rocks(&rocks); - - assert!(result.is_empty()); + let expected = vec![("W._id!foo#123,".to_string(), vec![0])]; + assert_eq!(result, expected); } } diff --git a/src/parser.rs b/src/parser.rs index 0cbda8a..6020daf 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -153,12 +153,18 @@ impl<'a, 'c> Parser<'a, 'c> { AggregateFun::GroupAsc } else if self.consume("sum") { AggregateFun::Sum + } else if self.consume("max_array") { + AggregateFun::MaxArray } else if self.consume("max") { AggregateFun::Max + } else if self.consume("min_array") { + AggregateFun::MinArray } else if self.consume("min") { AggregateFun::Min - } else if self.consume("list") { - AggregateFun::List + } else if self.consume("array_flat") { + AggregateFun::ArrayFlat + } else if self.consume("array") { + AggregateFun::Array } else if self.consume("concat") { AggregateFun::Concat } else if self.consume("avg") { diff --git a/src/query.rs b/src/query.rs index cea4f31..302d9ec 100644 --- a/src/query.rs +++ b/src/query.rs @@ -328,7 +328,7 @@ pub struct QueryResults<'a> { done_with_sorting_and_ags: bool, does_group_or_aggr: bool, sorts: Option>, - aggr_inits: Vec<(fn (&mut JsonValue), usize)>, + aggr_inits: Vec<(fn (JsonValue) -> JsonValue, usize)>, aggr_actions: Vec<(fn (&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, aggr_finals: Vec<(fn (&mut JsonValue), usize)>, in_buffer: Vec>, @@ -564,7 +564,12 @@ impl<'a> QueryResults<'a> { match QueryResults::cmp_results(&sorts, &old, &new) { Ordering::Less => { for &(ref init, n) in self.aggr_inits.iter() { - (init)(&mut new[n]); + // we can't swap out a value of new directly, so this lets us + // without shifting or cloning values, both of which can be + // expensive + let mut new_n = JsonValue::Null; + swap(&mut new_n, &mut new[n]); + new[n] = (init)(new_n); } //push back old value into sorted_buffer, //then use new value as old value. 
@@ -611,7 +616,12 @@ impl<'a> QueryResults<'a> { }, (None, Some(mut new)) => { for &(ref init, n) in self.aggr_inits.iter() { - (init)(&mut new[n]); + // we can't swap out a value of new directly, so this lets us + // without shifting or cloning values, both of which can be + // expensive + let mut new_n = JsonValue::Null; + swap(&mut new_n, &mut new[n]); + new[n] = (init)(new_n); } option_old = Some(new); option_new = self.in_buffer.pop(); @@ -646,15 +656,18 @@ pub enum AggregateFun { GroupDesc, Sum, Max, + MaxArray, Min, - List, + MinArray, + Array, + ArrayFlat, Concat, Avg, Count, } struct AggregateFunImpls { - init: Option, + init: Option JsonValue>, action: fn (&mut JsonValue, JsonValue, &JsonValue), extract: Option, } @@ -679,9 +692,24 @@ impl AggregateFun { action: AggregateFun::min, extract: None, }, - &AggregateFun::List => AggregateFunImpls{ - init: Some(AggregateFun::list_init), - action: AggregateFun::list, + &AggregateFun::MaxArray => AggregateFunImpls{ + init: Some(AggregateFun::max_array_init), + action: AggregateFun::max_array, + extract: None, + }, + &AggregateFun::MinArray => AggregateFunImpls{ + init: Some(AggregateFun::min_array_init), + action: AggregateFun::min_array, + extract: None, + }, + &AggregateFun::Array => AggregateFunImpls{ + init: Some(AggregateFun::array_init), + action: AggregateFun::array, + extract: None, + }, + &AggregateFun::ArrayFlat => AggregateFunImpls{ + init: Some(AggregateFun::array_flat_init), + action: AggregateFun::array_flat, extract: None, }, &AggregateFun::Concat => AggregateFunImpls{ @@ -702,18 +730,23 @@ impl AggregateFun { } } - fn sum_init(existing: &mut JsonValue) { - if let &mut JsonValue::Number(_) = existing { - //do nothing - } else { - *existing = JsonValue::Number(0.0) - } + fn sum_init(existing: JsonValue) -> JsonValue { + let mut base = JsonValue::Number(0.0); + AggregateFun::sum(&mut base, existing, &JsonValue::Null); + base } - fn sum(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - match (existing, new) { - (&mut JsonValue::Number(ref mut existing), JsonValue::Number(new)) => { - *existing += new; + fn sum(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + match new { + JsonValue::Number(new) => { + if let &mut JsonValue::Number(ref mut existing) = existing { + *existing += new; + } + }, + JsonValue::Array(vec) => { + for v in vec { + AggregateFun::sum(existing, v, _user_arg); + } }, _ => (), } @@ -731,21 +764,85 @@ impl AggregateFun { } } - fn list_init(existing: &mut JsonValue) { - *existing = JsonValue::Array(vec![existing.clone()]); + fn max_array_init(existing: JsonValue) -> JsonValue { + // The default value is an array, which can never be a value because arrays are always + // traversed. It's possible we never encounter a value due to only encountering empty + // arrays, in which case the final value is an empty array meaning no values encountered. 
+ let mut val = JsonValue::Array(vec![]); + AggregateFun::max_array(&mut val, existing, &JsonValue::Null); + val + } + + fn max_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec { + AggregateFun::max_array(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(_) = existing { + *existing = new; + } else if (*existing).cmp(&new) == Ordering::Less { + *existing = new; + } + } } - fn list(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + fn min_array_init(existing: JsonValue) -> JsonValue { + // The default value is an array, which can never be a value because arrays are always + // traversed. It's possible we never encounter a value due to only encountering empty + // arrays, in which case the final value is an empty array meaning no values encountered. + let mut val = JsonValue::Array(vec![]); + AggregateFun::min_array(&mut val, existing, &JsonValue::Null); + val + } + + fn min_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec { + AggregateFun::min_array(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(_) = existing { + *existing = new; + } else if (*existing).cmp(&new) == Ordering::Greater { + *existing = new; + } + } + } + + fn array_init(existing: JsonValue) -> JsonValue { + JsonValue::Array(vec![existing]) + } + + fn array(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { if let &mut JsonValue::Array(ref mut existing) = existing { existing.push(new); } } - fn concat_init(existing: &mut JsonValue) { - if let &mut JsonValue::String(ref _string) = existing { - // do nothing + fn array_flat_init(existing: JsonValue) -> JsonValue { + let mut new = JsonValue::Array(vec![]); + AggregateFun::array_flat(&mut new, existing, &JsonValue::Null); + new + } + + fn array_flat(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec.into_iter() { + AggregateFun::array_flat(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(ref mut existing) = existing { + existing.push(new); + } + } + } + + fn concat_init(existing: JsonValue) -> JsonValue { + if let JsonValue::String(_) = existing { + existing } else { - JsonValue::String(String::new()); + JsonValue::String(String::new()) } } @@ -760,13 +857,16 @@ impl AggregateFun { } } - fn avg_init(existing: &mut JsonValue) { - let new = if let &mut JsonValue::Number(ref num) = existing { - JsonValue::Array(vec![JsonValue::Number(num.clone()), JsonValue::Number(1.0)]) + fn avg_init(existing: JsonValue) -> JsonValue { + if let JsonValue::Number(_) = existing { + JsonValue::Array(vec![existing, JsonValue::Number(1.0)]) + } else if let JsonValue::Array(_) = existing { + let mut avg = JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]); + AggregateFun::avg(&mut avg, existing, &JsonValue::Null); + avg } else { JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]) - }; - *existing = new; + } } fn avg(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { @@ -791,6 +891,10 @@ impl AggregateFun { array[0] = JsonValue::Number(avg); array[1] = JsonValue::Number(count); } + } else if let JsonValue::Array(vec) = new { + for v in vec.into_iter() { + AggregateFun::avg(existing, v, _user_arg); + } } } @@ -818,8 +922,8 @@ impl AggregateFun { *existing = json } - fn count_init(existing: &mut JsonValue) { - *existing = 
JsonValue::Number(1.0); + fn count_init(_existing: JsonValue) -> JsonValue { + JsonValue::Number(1.0) } fn count(existing: &mut JsonValue, _: JsonValue, _user_arg: &JsonValue) { diff --git a/src/repl.rs b/src/repl.rs index 7885c82..b7850b5 100644 --- a/src/repl.rs +++ b/src/repl.rs @@ -7,7 +7,7 @@ use std::io::{Write, BufRead}; fn is_command(str: &str) -> bool { let commands = ["find", "add", "create", "drop", "open", - "pretty", "commit", "del", "load"]; + "pretty", "commit", "del", "load", "dumpkeys"]; for command in commands.iter() { if str.starts_with(command) { return true; } @@ -92,6 +92,17 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { Ok(()) => (), Err(reason) => write!(w, "{}\n", reason).unwrap(), } + } else if lines.starts_with("dumpkeys") { + match index.all_keys() { + Ok(keys) => { + for key in keys { + write!(w, "{}\n", key).unwrap(); + } + }, + Err(reason) => { + write!(w, "{}\n", reason).unwrap(); + }, + } } else if lines.starts_with("add") { match index.add(&lines[3..]) { Ok(id) => write!(w, "{}\n", JsonValue::str_to_literal(&id)).unwrap(), From c6dc65e51ec5ed26cd6b7e57226a8b460517caee Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 17 Feb 2017 10:53:55 -0800 Subject: [PATCH 072/122] Abstract out details of rocksdb snapshot/iterators from filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the details of walking the stemmed word index out of the filters and into a new term_index module. Discovered that it’s unnecessary to have all the Result return types, as the RocksDB DBIterator interfaces don’t return errors, so removed that from the QueryRuntimeFilter trait. --- src/filters.rs | 280 +++++++++++++++++++--------------------------- src/index.rs | 2 +- src/lib.rs | 1 + src/query.rs | 67 +++++------ src/repl.rs | 17 +-- src/returnable.rs | 113 +++++++++---------- src/term_index.rs | 64 +++++++++++ 7 files changed, 266 insertions(+), 278 deletions(-) create mode 100644 src/term_index.rs diff --git a/src/filters.rs b/src/filters.rs index 2a806c7..07625b1 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,22 +1,19 @@ -extern crate varint; - use std::str; use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::HashSet; use index::Index; use std::f32; -use std::io::Cursor; use error::Error; use key_builder::KeyBuilder; use query::{DocResult, QueryScoringInfo}; use returnable::RetValue; use json_value::JsonValue; +use term_index::{DocResultIterator}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs use rocksdb::{self, DBIterator, Snapshot, IteratorMode}; -use self::varint::VarintRead; struct Scorer { iter: DBIterator,
kb.get_keypathword_only(&stemmed_word), + iter: DocResultIterator::new(snapshot, stemmed_word, kb), scorer: Scorer::new(snapshot.iterator(IteratorMode::Start), stemmed_word, kb, boost), } } } impl QueryRuntimeFilter for StemmedWordFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); - // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), - rocksdb::Direction::Forward)); - - KeyBuilder::truncate_to_keypathword(&mut self.keypathword); - + fn first_result(&mut self, start: &DocResult) -> Option { + self.iter.advance_gte(start); self.next_result() } - fn next_result(&mut self) -> Result, Error> { - let (key, value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return Ok(None), - }; - if !key.starts_with(self.keypathword.as_bytes()) { - // we passed the key path we are interested in. nothing left to do */ - return Ok(None) - } - - // We have a candidate document to return - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); - - if self.scorer.should_score() { - let mut vec = Vec::with_capacity(value.len()); - vec.extend(value.into_iter()); - let mut bytes = Cursor::new(vec); - let mut count = 0; - while let Ok(_pos) = bytes.read_unsigned_varint_32() { - count += 1; + fn next_result(&mut self) -> Option { + if let Some((mut dr, pos)) = self.iter.next() { + if self.scorer.should_score() { + let count = pos.positions().len(); + self.scorer.add_match_score(count as u32, &mut dr); } - self.scorer.add_match_score(count, &mut dr); + Some(dr) + } else { + None } - - Ok(Some(dr)) } fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { @@ -180,8 +153,7 @@ impl QueryRuntimeFilter for StemmedWordFilter { /// This is not a QueryRuntimeFilter but it imitates one. Instead of returning just a DocResult /// it also return a vector of word positions, each being a instance of the word occurance pub struct StemmedWordPosFilter { - iter: DBIterator, - keypathword: String, + iter: DocResultIterator, scorer: Scorer, } @@ -189,50 +161,28 @@ impl StemmedWordPosFilter { pub fn new(snapshot: &Snapshot, stemmed_word: &str, kb: &KeyBuilder, boost: f32) -> StemmedWordPosFilter { StemmedWordPosFilter{ - iter: snapshot.iterator(IteratorMode::Start), - keypathword: kb.get_keypathword_only(&stemmed_word), + iter: DocResultIterator::new(snapshot, stemmed_word, kb), scorer: Scorer::new(snapshot.iterator(IteratorMode::Start), &stemmed_word, &kb, boost), } } - fn first_result(&mut self, - start: &DocResult) -> Result)>, Error> { - - KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); - // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), - rocksdb::Direction::Forward)); - - KeyBuilder::truncate_to_keypathword(&mut self.keypathword); - + fn first_result(&mut self, start: &DocResult) -> Option<(DocResult, Vec)> { + self.iter.advance_gte(start); self.next_result() } - fn next_result(&mut self) -> Result)>, Error> { - let (key, value) = match self.iter.next() { - Some((key, value)) => (key, value), - None => return Ok(None), - }; - if !key.starts_with(self.keypathword.as_bytes()) { - // we passed the key path we are interested in. 
nothing left to do */ - return Ok(None) - } - - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); - - let mut vec = Vec::with_capacity(value.len()); - vec.extend(value.into_iter()); - let mut bytes = Cursor::new(vec); - let mut positions = Vec::new(); - while let Ok(pos) = bytes.read_unsigned_varint_32() { - positions.push(pos); + fn next_result(&mut self) -> Option<(DocResult, Vec)> { + if let Some((mut dr, pos)) = self.iter.next() { + let positions = pos.positions(); + if self.scorer.should_score() { + let count = positions.len(); + self.scorer.add_match_score(count as u32, &mut dr); + } + Some((dr, positions)) + } else { + None } - - self.scorer.add_match_score(positions.len() as u32, &mut dr); - - Ok(Some((dr, positions))) } fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { @@ -252,16 +202,15 @@ impl StemmedPhraseFilter { } } - fn result(&mut self, - base: Option<(DocResult, Vec)>) -> Result, Error> { + fn result(&mut self, base: Option<(DocResult, Vec)>) -> Option { // this is the number of matches left before all terms match and we can return a result let mut matches_left = self.filters.len() - 1; - if base.is_none() { return Ok(None); } + if base.is_none() { return None; } let (mut base_result, mut base_positions) = base.unwrap(); if matches_left == 0 { - return Ok(Some(base_result)); + return Some(base_result); } let mut current_filter = 0; @@ -271,9 +220,9 @@ impl StemmedPhraseFilter { current_filter = 0; } - let next = try!(self.filters[current_filter].first_result(&base_result)); + let next = self.filters[current_filter].first_result(&base_result); - if next.is_none() { return Ok(None); } + if next.is_none() { return None; } let (next_result, next_positions) = next.unwrap(); if base_result == next_result { @@ -289,13 +238,13 @@ impl StemmedPhraseFilter { matches_left -= 1; if matches_left == 0 { - return Ok(Some(base_result)); + return Some(base_result); } } else { // we didn't match on phrase, so get next_result from first filter current_filter = 0; - let next = try!(self.filters[current_filter].next_result()); - if next.is_none() { return Ok(None); } + let next = self.filters[current_filter].next_result(); + if next.is_none() { return None; } let (next_result, next_positions) = next.unwrap(); base_result = next_result; base_positions = next_positions; @@ -306,8 +255,8 @@ impl StemmedPhraseFilter { // we didn't match on next_result, so get first_result at next_result on // 1st filter. 
current_filter = 0; - let next = try!(self.filters[current_filter].first_result(&next_result)); - if next.is_none() { return Ok(None); } + let next = self.filters[current_filter].first_result(&next_result); + if next.is_none() { return None; } let (next_result, next_positions) = next.unwrap(); base_result = next_result; base_positions = next_positions; @@ -320,13 +269,13 @@ impl StemmedPhraseFilter { impl QueryRuntimeFilter for StemmedPhraseFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - let base_result = try!(self.filters[0].first_result(start)); + fn first_result(&mut self, start: &DocResult) -> Option { + let base_result = self.filters[0].first_result(start); self.result(base_result) } - fn next_result(&mut self) -> Result, Error> { - let base_result = try!(self.filters[0].next_result()); + fn next_result(&mut self) -> Option { + let base_result = self.filters[0].next_result(); self.result(base_result) } @@ -366,7 +315,7 @@ impl ExactMatchFilter { } } - fn check_exact(&mut self, mut dr: DocResult) -> Result, Error> { + fn check_exact(&mut self, mut dr: DocResult) -> Option { loop { let value_key = self.kb.value_key_from_doc_result(&dr); @@ -382,13 +331,13 @@ impl ExactMatchFilter { self.phrase == string.to_lowercase() }; if matches { - return Ok(Some(dr)); + return Some(dr); } else { - if let Some(next) = try!(self.filter.next_result()) { + if let Some(next) = self.filter.next_result() { dr = next; // continue looping } else { - return Ok(None); + return None; } } } else { @@ -402,19 +351,19 @@ impl ExactMatchFilter { } impl QueryRuntimeFilter for ExactMatchFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - if let Some(dr) = try!(self.filter.first_result(start)) { + fn first_result(&mut self, start: &DocResult) -> Option { + if let Some(dr) = self.filter.first_result(start) { self.check_exact(dr) } else { - Ok(None) + None } } - fn next_result(&mut self) -> Result, Error> { - if let Some(dr) = try!(self.filter.next_result()) { + fn next_result(&mut self) -> Option { + if let Some(dr) = self.filter.next_result() { self.check_exact(dr) } else { - Ok(None) + None } } @@ -446,14 +395,13 @@ impl DistanceFilter { } } - fn result(&mut self, - base: Option<(DocResult, Vec)>) -> Result, Error> { + fn result(&mut self, base: Option<(DocResult, Vec)>) -> Option { // yes this code complex. I tried to break it up, but it wants to be like this. 
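// In outline: the per-term position filters are advanced round-robin until they
// all land on the same DocResult. Word positions survive only if they lie within
// the allowed distance of a position contributed by a different term; once every
// term still has a surviving position, the DocResult is returned, otherwise the
// filters keep seeking forward.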
// this is the number of matches left before all terms match and we can return a result let mut matches_left = self.filters.len() - 1; - if base.is_none() { return Ok(None); } + if base.is_none() { return None; } let (mut base_result, positions) = base.unwrap(); // This contains tuples of word postions and the filter they came from, @@ -471,9 +419,9 @@ impl DistanceFilter { self.current_filter = 0; } - let next = try!(self.filters[self.current_filter].first_result(&base_result)); + let next = self.filters[self.current_filter].first_result(&base_result); - if next.is_none() { return Ok(None); } + if next.is_none() { return None; } let (next_result, next_positions) = next.unwrap(); if base_result != next_result { @@ -530,15 +478,15 @@ impl DistanceFilter { matches_left -= 1; if matches_left == 0 { - return Ok(Some(base_result)); + return Some(base_result); } else { continue; } } // we didn't match on next_result, so get next_result on current filter - let next = try!(self.filters[self.current_filter].next_result()); + let next = self.filters[self.current_filter].next_result(); - if next.is_none() { return Ok(None); } + if next.is_none() { return None; } let (next_result, next_positions) = next.unwrap(); base_result = next_result; base_positions = next_positions.iter() @@ -551,13 +499,13 @@ impl DistanceFilter { } impl QueryRuntimeFilter for DistanceFilter { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].first_result(start)); + fn first_result(&mut self, start: &DocResult) -> Option { + let base_result = self.filters[self.current_filter].first_result(start); self.result(base_result) } - fn next_result(&mut self) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].next_result()); + fn next_result(&mut self) -> Option { + let base_result = self.filters[self.current_filter].next_result(); self.result(base_result) } @@ -592,10 +540,10 @@ impl<'a> AndFilter<'a> { } } - fn result(&mut self, base: Option) -> Result, Error> { + fn result(&mut self, base: Option) -> Option { let mut matches_count = self.filters.len() - 1; - if base.is_none() { return Ok(None); } + if base.is_none() { return None; } let mut base_result = base.unwrap(); base_result.arraypath.resize(self.array_depth, 0); @@ -606,9 +554,9 @@ impl<'a> AndFilter<'a> { self.current_filter = 0; } - let next = try!(self.filters[self.current_filter].first_result(&base_result)); + let next = self.filters[self.current_filter].first_result(&base_result); - if next.is_none() { return Ok(None); } + if next.is_none() { return None; } let mut next_result = next.unwrap(); next_result.arraypath.resize(self.array_depth, 0); @@ -617,7 +565,7 @@ impl<'a> AndFilter<'a> { matches_count -= 1; base_result.combine(&mut next_result); if matches_count == 0 { - return Ok(Some(base_result)); + return Some(base_result); } } else { base_result = next_result; @@ -628,13 +576,13 @@ impl<'a> AndFilter<'a> { } impl<'a> QueryRuntimeFilter for AndFilter<'a> { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].first_result(start)); + fn first_result(&mut self, start: &DocResult) -> Option { + let base_result = self.filters[self.current_filter].first_result(start); self.result(base_result) } - fn next_result(&mut self) -> Result, Error> { - let base_result = try!(self.filters[self.current_filter].next_result()); + fn next_result(&mut self) -> Option { + let base_result = 
self.filters[self.current_filter].next_result(); self.result(base_result) } @@ -672,36 +620,34 @@ pub struct FilterWithResult<'a> { } impl<'a> FilterWithResult<'a> { - fn prime_first_result(&mut self, start: &DocResult) -> Result<(), Error> { + fn prime_first_result(&mut self, start: &DocResult) { if self.is_done { - return Ok(()) + return; } if self.result.is_none() { - self.result = try!(self.filter.first_result(start)); + self.result = self.filter.first_result(start); } else if self.result.as_ref().unwrap().less(start, self.array_depth) { - self.result = try!(self.filter.first_result(start)); + self.result = self.filter.first_result(start); } if self.result.is_none() { self.is_done = true; } else { self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); } - Ok(()) } - fn prime_next_result(&mut self) -> Result<(), Error> { + fn prime_next_result(&mut self) { if self.is_done { - return Ok(()) + return; } if self.result.is_none() { - self.result = try!(self.filter.next_result()); + self.result = self.filter.next_result(); } if self.result.is_none() { self.is_done = true; } else { self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); } - Ok(()) } } @@ -768,16 +714,16 @@ impl<'a> OrFilter<'a> { } impl<'a> QueryRuntimeFilter for OrFilter<'a> { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - try!(self.left.prime_first_result(start)); - try!(self.right.prime_first_result(start)); - Ok(self.take_smallest()) + fn first_result(&mut self, start: &DocResult) -> Option { + self.left.prime_first_result(start); + self.right.prime_first_result(start); + self.take_smallest() } - fn next_result(&mut self) -> Result, Error> { - try!(self.left.prime_next_result()); - try!(self.right.prime_next_result()); - Ok(self.take_smallest()) + fn next_result(&mut self) -> Option { + self.left.prime_next_result(); + self.right.prime_next_result(); + self.take_smallest() } fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { @@ -818,26 +764,26 @@ impl<'a> NotFilter<'a> { } impl<'a> QueryRuntimeFilter for NotFilter<'a> { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { + fn first_result(&mut self, start: &DocResult) -> Option { let mut start = start.clone_only_seq_and_arraypath(); - while let Some(dr) = try!(self.filter.first_result(&start)) { + while let Some(dr) = self.filter.first_result(&start) { if start.less(&dr, self.array_depth) { self.last_doc_returned = Some(start.clone_only_seq_and_arraypath()); - return Ok(Some(start.clone_only_seq_and_arraypath())); + return Some(start.clone_only_seq_and_arraypath()); } start.increment_last(self.array_depth); } self.last_doc_returned = None; - Ok(Some(start)) + Some(start) } - fn next_result(&mut self) -> Result, Error> { + fn next_result(&mut self) -> Option { let next = if let Some(ref last_doc_returned) = self.last_doc_returned { let mut next = last_doc_returned.clone_only_seq_and_arraypath(); next.increment_last(self.array_depth); next } else { - return Ok(None); + return None; }; self.first_result(&next) } @@ -882,53 +828,53 @@ impl<'a> BindFilter<'a> { } } - fn collect_results(&mut self, mut first: DocResult) -> Result, Error> { + fn collect_results(&mut self, mut first: DocResult) -> Option { let value_key = self.kb.value_key_from_doc_result(&first); first.add_bind_name_result(&self.bind_var_name, value_key); - while let Some(next) = try!(self.filter.next_result()) { + while let Some(next) = self.filter.next_result() { if next.seq == first.seq { let value_key = 
self.kb.value_key_from_doc_result(&next); first.add_bind_name_result(&self.bind_var_name, value_key); } else { self.option_next = Some(next); - return Ok(Some(first)); + return Some(first); } } - Ok(Some(first)) + Some(first) } } impl<'a> QueryRuntimeFilter for BindFilter<'a> { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { + fn first_result(&mut self, start: &DocResult) -> Option { let first = if let Some(next) = self.option_next.take() { if start.less(&next, self.array_depth) { Some(next) } else { - try!(self.filter.first_result(&start)) + self.filter.first_result(&start) } } else { - try!(self.filter.first_result(&start)) + self.filter.first_result(&start) }; if let Some(first) = first { self.collect_results(first) } else { - Ok(None) + None } } - fn next_result(&mut self) -> Result, Error> { + fn next_result(&mut self) -> Option { let first = if let Some(next) = self.option_next.take() { Some(next) } else { - try!(self.filter.next_result()) + self.filter.next_result() }; if let Some(first) = first { self.collect_results(first) } else { - Ok(None) + None } } @@ -960,21 +906,21 @@ impl<'a> BoostFilter<'a> { } impl<'a> QueryRuntimeFilter for BoostFilter<'a> { - fn first_result(&mut self, start: &DocResult) -> Result, Error> { - if let Some(mut dr) = try!(self.filter.first_result(&start)) { + fn first_result(&mut self, start: &DocResult) -> Option { + if let Some(mut dr) = self.filter.first_result(&start) { dr.boost_scores(self.boost); - Ok(Some(dr)) + Some(dr) } else { - Ok(None) + None } } - fn next_result(&mut self) -> Result, Error> { - if let Some(mut dr) = try!(self.filter.next_result()) { + fn next_result(&mut self) -> Option { + if let Some(mut dr) = self.filter.next_result() { dr.boost_scores(self.boost); - Ok(Some(dr)) + Some(dr) } else { - Ok(None) + None } } diff --git a/src/index.rs b/src/index.rs index bb0ae9b..96410af 100644 --- a/src/index.rs +++ b/src/index.rs @@ -336,7 +336,7 @@ mod tests { index.flush().unwrap(); let mut results = Query::get_matches(r#"find {foo:=="bar"}"#, &index).unwrap(); - let query_id = results.get_next_id().unwrap().unwrap(); + let query_id = results.get_next_id().unwrap(); assert!(query_id.len() == 32); assert_eq!(query_id, id); } diff --git a/src/lib.rs b/src/lib.rs index a6709d0..655e47e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ mod json_shred; mod key_builder; mod parser; mod stems; +mod term_index; mod returnable; pub mod repl; pub mod json_value; diff --git a/src/query.rs b/src/query.rs index 302d9ec..60b0258 100644 --- a/src/query.rs +++ b/src/query.rs @@ -356,57 +356,57 @@ impl<'a> QueryResults<'a> { / (self.scoring_num_terms as f32) } - fn get_next_result(&mut self) -> Result, Error> { + fn get_next_result(&mut self) -> Option { if self.done_with_sorting_and_ags { - return Ok(None); + return None; } - let result = try!(self.filter.first_result(&self.doc_result_next)); + let result = self.filter.first_result(&self.doc_result_next); match result { Some(doc_result) => { self.doc_result_next.seq = doc_result.seq + 1; - Ok(Some(doc_result)) + Some(doc_result) }, - None => Ok(None), + None => None, } } - fn get_next(&mut self) -> Result, Error> { - if let Some(doc_result) = try!(self.get_next_result()) { - Ok(Some(doc_result.seq)) + fn get_next(&mut self) -> Option { + if let Some(doc_result) = self.get_next_result() { + Some(doc_result.seq) } else { - Ok(None) + None } } - pub fn get_next_id(&mut self) -> Result, Error> { - let seq = try!(self.get_next()); + pub fn get_next_id(&mut self) -> Option { + let seq 
= self.get_next(); match seq { Some(seq) => { let key = format!("V{}#._id", seq); - match try!(self.snapshot.get(&key.as_bytes())) { + match self.snapshot.get(&key.as_bytes()).unwrap() { // If there is an id, it's UTF-8. Strip off type leading byte - Some(id) => Ok(Some(id.to_utf8().unwrap()[1..].to_string())), - None => Ok(None) + Some(id) => Some(id.to_utf8().unwrap()[1..].to_string()), + None => None } }, - None => Ok(None), + None => None, } } - pub fn next_result(&mut self) -> Result, Error> { + pub fn next_result(&mut self) -> Option { if self.needs_sorting_and_ags { loop { let next = if self.done_with_sorting_and_ags { None } else { - try!(self.get_next_result()) + self.get_next_result() }; match next { Some(dr) => { let score = self.compute_relevancy_score(&dr); let mut results = VecDeque::new(); - try!(self.returnable.fetch_result(&mut self.iter, dr.seq, score, - &dr.bind_name_result, &mut results)); + self.returnable.fetch_result(&mut self.iter, dr.seq, score, + &dr.bind_name_result, &mut results); self.in_buffer.push(results); if self.in_buffer.len() == self.limit { self.do_sorting_and_ags(); @@ -426,23 +426,23 @@ impl<'a> QueryResults<'a> { } } if let Some(mut results) = self.sorted_buffer.pop() { - return Ok(Some(try!(self.returnable.json_result(&mut results)))); + return Some(self.returnable.json_result(&mut results)); } else { - return Ok(None); + return None; } }, } } } else { - let dr = match try!(self.get_next_result()) { + let dr = match self.get_next_result() { Some(dr) => dr, - None => return Ok(None), + None => return None, }; let score = self.compute_relevancy_score(&dr); let mut results = VecDeque::new(); - try!(self.returnable.fetch_result(&mut self.iter, dr.seq, score, - &dr.bind_name_result, &mut results)); - Ok(Some(try!(self.returnable.json_result(&mut results)))) + self.returnable.fetch_result(&mut self.iter, dr.seq, score, + &dr.bind_name_result, &mut results); + Some(self.returnable.json_result(&mut results)) } } @@ -638,14 +638,10 @@ impl<'a> QueryResults<'a> { } impl<'a> Iterator for QueryResults<'a> { - type Item = Result; + type Item = JsonValue; - fn next(&mut self) -> Option> { - match self.next_result() { - Ok(Some(json)) => Some(Ok(json)), - Ok(None) => None, - Err(reason) => Some(Err(reason)), - } + fn next(&mut self) -> Option { + self.next_result() } } @@ -995,9 +991,8 @@ mod tests { let mut query_results = Query::get_matches(r#"find {data: == "u"}"#, &index).unwrap(); loop { match query_results.get_next_id() { - Ok(Some(result)) => println!("result: {}", result), - Ok(None) => break, - Err(error) => panic!(error), + Some(result) => println!("result: {}", result), + None => break, } } } diff --git a/src/repl.rs b/src/repl.rs index b7850b5..031fe9f 100644 --- a/src/repl.rs +++ b/src/repl.rs @@ -131,19 +131,12 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { w.write_all(b"\n").unwrap(); } pretty.push(); - while let Some(result) = results.next() { - match result { - Ok(json) => { - json.render(w, &mut pretty).unwrap(); - if results.peek().is_some() { - w.write_all(b",").unwrap(); - } - w.write_all(b"\n").unwrap(); - }, - Err(reason) => { - write!(w, "{}\n", reason).unwrap(); - }, + while let Some(json) = results.next() { + json.render(w, &mut pretty).unwrap(); + if results.peek().is_some() { + w.write_all(b",").unwrap(); } + w.write_all(b"\n").unwrap(); } w.write_all(b"]\n").unwrap(); }, diff --git a/src/returnable.rs b/src/returnable.rs index f87965a..b2d42e5 100644 --- a/src/returnable.rs +++ b/src/returnable.rs @@ -6,7 +6,6 @@ use 
std::mem::transmute; use std::collections::VecDeque; use std::iter::Iterator; -use error::Error; use key_builder::{KeyBuilder, Segment}; use json_value::{JsonValue}; use query::{AggregateFun, SortInfo}; @@ -88,7 +87,7 @@ pub trait Returnable { /// VecDeque. fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error>; + result: &mut VecDeque); /// If aggregates are used each Returnable needs to return information about the /// aggregate function it's using and the default value. @@ -109,7 +108,7 @@ pub trait Returnable { /// This is the final step of a Returnable. The previous fetched JsonValues are now /// rendered with other ornamental json elements. - fn json_result(&self, results: &mut VecDeque) -> Result; + fn json_result(&self, results: &mut VecDeque) -> JsonValue; } /// A static Json Object the can contain another number of fields and nested returnables. @@ -120,11 +119,10 @@ pub struct RetObject { impl Returnable for RetObject { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { + result: &mut VecDeque) { for &(ref _key, ref field) in self.fields.iter() { - try!(field.fetch_result(iter, seq, score, bind_var_keys, result)); + field.fetch_result(iter, seq, score, bind_var_keys, result); } - Ok(()) } fn get_aggregate_funs(&self, funs: &mut Vec>) { @@ -145,12 +143,12 @@ impl Returnable for RetObject { } } - fn json_result(&self, results: &mut VecDeque) -> Result { + fn json_result(&self, results: &mut VecDeque) -> JsonValue { let mut vec = Vec::with_capacity(self.fields.len()); for &(ref key, ref returnable) in self.fields.iter() { - vec.push((key.clone(), try!(returnable.json_result(results)))); + vec.push((key.clone(), returnable.json_result(results))); } - Ok(JsonValue::Object(vec)) + JsonValue::Object(vec) } } @@ -162,11 +160,10 @@ pub struct RetArray { impl Returnable for RetArray { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { + result: &mut VecDeque) { for ref slot in self.slots.iter() { - try!(slot.fetch_result(iter, seq, score, bind_var_keys, result)); + slot.fetch_result(iter, seq, score, bind_var_keys, result); } - Ok(()) } fn get_aggregate_funs(&self, funs: &mut Vec>) { @@ -187,12 +184,12 @@ impl Returnable for RetArray { } } - fn json_result(&self, results: &mut VecDeque) -> Result { + fn json_result(&self, results: &mut VecDeque) -> JsonValue { let mut vec = Vec::with_capacity(self.slots.len()); for slot in self.slots.iter() { - vec.push(try!(slot.json_result(results))); + vec.push(slot.json_result(results)); } - Ok(JsonValue::Array(vec)) + JsonValue::Array(vec) } } @@ -206,12 +203,12 @@ pub struct RetHidden { impl Returnable for RetHidden { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { + result: &mut VecDeque) { for ref unrendered in self.unrendered.iter() { - try!(unrendered.fetch_result(iter, seq, score, bind_var_keys, result)); + unrendered.fetch_result(iter, seq, score, bind_var_keys, result); } - self.visible.fetch_result(iter, seq, score, bind_var_keys, result) + self.visible.fetch_result(iter, seq, score, bind_var_keys, result); } fn get_aggregate_funs(&self, funs: &mut Vec>) { @@ -230,7 +227,7 @@ impl Returnable for RetHidden { self.visible.get_sorting(sorts); } - fn json_result(&self, results: &mut 
VecDeque) -> Result { + fn json_result(&self, results: &mut VecDeque) -> JsonValue { for _n in 0..self.unrendered.len() { // we already sorted at this point, now discard the values results.pop_front(); @@ -248,8 +245,7 @@ pub struct RetLiteral { impl Returnable for RetLiteral { fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, _score: f32, _bind_var_keys: &HashMap>, - _result: &mut VecDeque) -> Result<(), Error> { - Ok(()) + _result: &mut VecDeque) { } fn get_aggregate_funs(&self, _funs: &mut Vec>) { @@ -264,8 +260,8 @@ impl Returnable for RetLiteral { //noop } - fn json_result(&self, _results: &mut VecDeque) -> Result { - Ok(self.json.clone()) + fn json_result(&self, _results: &mut VecDeque) -> JsonValue { + self.json.clone() } } @@ -303,15 +299,15 @@ impl RetValue { } } - fn return_array(mut array: Vec<(u64, JsonValue)>) -> Result { + fn return_array(mut array: Vec<(u64, JsonValue)>) -> JsonValue { array.sort_by_key(|tuple| tuple.0); - Ok(JsonValue::Array(array.into_iter() + JsonValue::Array(array.into_iter() .map(|(_i, json)| json) - .collect())) + .collect()) } fn descend_return_path(iter: &mut DBIterator, seq: u64, kb: &mut KeyBuilder, - rp: &ReturnPath, mut rp_index: usize) -> Result, Error> { + rp: &ReturnPath, mut rp_index: usize) -> Option { while let Some(segment) = rp.nth(rp_index) { rp_index += 1; @@ -325,8 +321,8 @@ impl RetValue { loop { kb.push_array_index(i); i += 1; - if let Some(json) = try!(RetValue::descend_return_path(iter, seq, - &mut kb.clone(), rp, rp_index)) { + if let Some(json) = RetValue::descend_return_path(iter, seq, + &mut kb.clone(), rp, rp_index) { vec.push(json); kb.pop_array(); } else { @@ -348,9 +344,9 @@ impl RetValue { } if vec.is_empty() { - return Ok(None); + return None; } else { - return Ok(Some(JsonValue::Array(vec))); + return Some(JsonValue::Array(vec)); } } } @@ -370,25 +366,24 @@ impl RetValue { let (key, value) = match iter.next() { Some((key, value)) => (key, value), None => { - return Ok(None) + return None }, }; if !key.starts_with(value_key.as_bytes()) { - return Ok(None) + return None } - let json_value = try!(RetValue::fetch(&mut iter.peekable(), &value_key, - key, value)); - Ok(Some(json_value)) + let json_value = RetValue::fetch(&mut iter.peekable(), &value_key, key, value); + Some(json_value) } fn fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, - mut key: Box<[u8]>, mut value: Box<[u8]>) -> Result { + mut key: Box<[u8]>, mut value: Box<[u8]>) -> JsonValue { if key.len() == value_key.len() { // we have a key match! 
- return Ok(RetValue::bytes_to_json_value(value.as_ref())); + return RetValue::bytes_to_json_value(value.as_ref()); } let segment = { let key_str = unsafe{str::from_utf8_unchecked(&key)}; @@ -402,13 +397,13 @@ impl RetValue { let mut value_key_next = value_key.to_string() + &escaped; loop { - let json_val = try!(RetValue::fetch(iter, &value_key_next, key, value)); + let json_val = RetValue::fetch(iter, &value_key_next, key, value); object.push((unescaped, json_val)); let segment = match iter.peek() { Some(&(ref k, ref _v)) => { if !k.starts_with(value_key.as_bytes()) { - return Ok(JsonValue::Object(object)); + return JsonValue::Object(object); } let key_str = unsafe{str::from_utf8_unchecked(&k)}; @@ -416,7 +411,7 @@ impl RetValue { KeyBuilder::parse_first_key_value_segment(&remaining) }, - None => return Ok(JsonValue::Object(object)), + None => return JsonValue::Object(object), }; if let Some((Segment::ObjectKey(unescaped2), escaped2)) = segment { @@ -432,7 +427,7 @@ impl RetValue { value_key_next.truncate(value_key.len()); value_key_next.push_str(&escaped2); } else { - return Ok(JsonValue::Object(object)); + return JsonValue::Object(object); } } } @@ -443,8 +438,7 @@ impl RetValue { let mut value_key_next = value_key.to_string() + &escaped; loop { - let json_val = try!(RetValue::fetch(iter, &value_key_next, - key, value)); + let json_val = RetValue::fetch(iter, &value_key_next, key, value); array.push((i, json_val)); let segment = match iter.peek() { @@ -489,21 +483,19 @@ impl RetValue { impl Returnable for RetValue { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, _bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { + result: &mut VecDeque) { if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { //don't fetch anything for count(). 
just stick in a null result.push_back(JsonValue::Null); - return Ok(()); } let mut kb = KeyBuilder::new(); - if let Some(json) = try!(RetValue::descend_return_path(iter, seq, &mut kb, &self.rp, 0)) { + if let Some(json) = RetValue::descend_return_path(iter, seq, &mut kb, &self.rp, 0) { result.push_back(json); } else { result.push_back(self.default.clone()); } - Ok(()) } fn get_aggregate_funs(&self, funs: &mut Vec>) { @@ -518,9 +510,9 @@ impl Returnable for RetValue { sorts.push(self.sort_info.take()); } - fn json_result(&self, results: &mut VecDeque) -> Result { + fn json_result(&self, results: &mut VecDeque) -> JsonValue { if let Some(json) = results.pop_front() { - Ok(json) + json } else { panic!("missing result!"); } @@ -541,7 +533,7 @@ pub struct RetBind { impl Returnable for RetBind { fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { + result: &mut VecDeque) { if let Some(value_keys) = bind_var_keys.get(&self.bind_name) { let mut array = Vec::with_capacity(value_keys.len()); @@ -550,8 +542,8 @@ impl Returnable for RetBind { kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&base_key)); - if let Some(json) = try!(RetValue::descend_return_path(iter, seq, &mut kb, - &self.extra_rp, 0)) { + if let Some(json) = RetValue::descend_return_path(iter, seq, &mut kb, + &self.extra_rp, 0) { array.push(json); } else { array.push(self.default.clone()); @@ -559,10 +551,8 @@ impl Returnable for RetBind { } result.push_back(JsonValue::Array(array)); } else { - result.push_back(JsonValue::Array(vec![self.default.clone()])) + result.push_back(JsonValue::Array(vec![self.default.clone()])); } - - Ok(()) } fn get_aggregate_funs(&self, funs: &mut Vec>) { @@ -577,9 +567,9 @@ impl Returnable for RetBind { sorts.push(self.sort_info.take()); } - fn json_result(&self, results: &mut VecDeque) -> Result { + fn json_result(&self, results: &mut VecDeque) -> JsonValue { if let Some(json) = results.pop_front() { - Ok(json) + json } else { panic!("missing bind result!"); } @@ -594,9 +584,8 @@ pub struct RetScore { impl Returnable for RetScore { fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, score: f32, _bind_var_keys: &HashMap>, - result: &mut VecDeque) -> Result<(), Error> { + result: &mut VecDeque) { result.push_back(JsonValue::Number(score as f64)); - Ok(()) } fn get_aggregate_funs(&self, _funs: &mut Vec>) { @@ -611,9 +600,9 @@ impl Returnable for RetScore { sorts.push(self.sort_info.take()); } - fn json_result(&self, results: &mut VecDeque) -> Result { + fn json_result(&self, results: &mut VecDeque) -> JsonValue { if let Some(json) = results.pop_front() { - Ok(json) + json } else { panic!("missing score result!"); } diff --git a/src/term_index.rs b/src/term_index.rs new file mode 100644 index 0000000..8e6e8de --- /dev/null +++ b/src/term_index.rs @@ -0,0 +1,64 @@ +extern crate varint; + +use std::io::Cursor; +use std::str; + +use key_builder::KeyBuilder; +use query::DocResult; + +use rocksdb::{self, DBIterator, Snapshot, IteratorMode}; +use self::varint::VarintRead; + +pub struct DocResultIterator { + iter: DBIterator, + keypathword: String, +} + +impl DocResultIterator { + pub fn new(snapshot: &Snapshot, word: &str, kb: &KeyBuilder) -> DocResultIterator { + DocResultIterator { + iter: snapshot.iterator(IteratorMode::Start), + keypathword: kb.get_keypathword_only(&word), + } + } + + pub fn advance_gte(&mut self, start: &DocResult) { + KeyBuilder::add_doc_result_to_keypathword(&mut 
self.keypathword, &start); + // Seek in index to >= entry + self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), + rocksdb::Direction::Forward)); + KeyBuilder::truncate_to_keypathword(&mut self.keypathword); + } + + pub fn next(&mut self) -> Option<(DocResult, TermPositions)> { + if let Some((key, value)) = self.iter.next() { + if !key.starts_with(self.keypathword.as_bytes()) { + // we passed the key path we are interested in. nothing left to do */ + return None + } + + let key_str = unsafe{str::from_utf8_unchecked(&key)}; + let dr = KeyBuilder::parse_doc_result_from_key(&key_str); + + Some((dr, TermPositions{pos: value.into_vec()})) + } else { + None + } + } +} + + +pub struct TermPositions { + pos: Vec, +} + +impl TermPositions { + pub fn positions(self) -> Vec { + let mut bytes = Cursor::new(self.pos); + let mut positions = Vec::new(); + while let Ok(pos) = bytes.read_unsigned_varint_32() { + positions.push(pos); + } + positions + } +} From 5f1bf60b8bcd901bb05a2d66e42b08250c549fee Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 17 Feb 2017 11:16:21 -0800 Subject: [PATCH 073/122] Move aggregate code out of query.rs and into own module --- src/aggregates.rs | 293 ++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/parser.rs | 3 +- src/query.rs | 285 +------------------------------------------- src/returnable.rs | 3 +- 5 files changed, 299 insertions(+), 286 deletions(-) create mode 100644 src/aggregates.rs diff --git a/src/aggregates.rs b/src/aggregates.rs new file mode 100644 index 0000000..8bd054f --- /dev/null +++ b/src/aggregates.rs @@ -0,0 +1,293 @@ + +use std::cmp::Ordering; + +use json_value::JsonValue; + +#[derive(PartialEq, Eq, Clone)] +pub enum AggregateFun { + GroupAsc, + GroupDesc, + Sum, + Max, + MaxArray, + Min, + MinArray, + Array, + ArrayFlat, + Concat, + Avg, + Count, +} + +pub struct AggregateFunImpls { + // Initalizes for a computing the aggregate action (optional) + pub init: Option JsonValue>, + + // The actual aggregate action function + pub action: fn (&mut JsonValue, JsonValue, &JsonValue), + + // extracts the final aggregate value (optional) + pub extract: Option, +} + +impl AggregateFun { + pub fn get_fun_impls(&self) -> AggregateFunImpls { + match self { + &AggregateFun::GroupAsc => panic!("cannot get aggregate fun for grouping!"), + &AggregateFun::GroupDesc => panic!("cannot get aggregate fun for grouping!"), + &AggregateFun::Sum => AggregateFunImpls{ + init: Some(AggregateFun::sum_init), + action: AggregateFun::sum, + extract: None, + }, + &AggregateFun::Max => AggregateFunImpls{ + init: None, + action: AggregateFun::max, + extract: None, + }, + &AggregateFun::Min => AggregateFunImpls{ + init: None, + action: AggregateFun::min, + extract: None, + }, + &AggregateFun::MaxArray => AggregateFunImpls{ + init: Some(AggregateFun::max_array_init), + action: AggregateFun::max_array, + extract: None, + }, + &AggregateFun::MinArray => AggregateFunImpls{ + init: Some(AggregateFun::min_array_init), + action: AggregateFun::min_array, + extract: None, + }, + &AggregateFun::Array => AggregateFunImpls{ + init: Some(AggregateFun::array_init), + action: AggregateFun::array, + extract: None, + }, + &AggregateFun::ArrayFlat => AggregateFunImpls{ + init: Some(AggregateFun::array_flat_init), + action: AggregateFun::array_flat, + extract: None, + }, + &AggregateFun::Concat => AggregateFunImpls{ + init: Some(AggregateFun::concat_init), + action: AggregateFun::concat, + extract: None, + }, + &AggregateFun::Avg => AggregateFunImpls{ + init: 
Some(AggregateFun::avg_init), + action: AggregateFun::avg, + extract: Some(AggregateFun::avg_final), + }, + &AggregateFun::Count => AggregateFunImpls{ + init: Some(AggregateFun::count_init), + action: AggregateFun::count, + extract: None, + }, + } + } + + fn sum_init(existing: JsonValue) -> JsonValue { + let mut base = JsonValue::Number(0.0); + AggregateFun::sum(&mut base, existing, &JsonValue::Null); + base + } + + fn sum(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + match new { + JsonValue::Number(new) => { + if let &mut JsonValue::Number(ref mut existing) = existing { + *existing += new; + } + }, + JsonValue::Array(vec) => { + for v in vec { + AggregateFun::sum(existing, v, _user_arg); + } + }, + _ => (), + } + } + + fn max(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if *existing < new { + *existing = new + } + } + + fn min(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if *existing > new { + *existing = new + } + } + + fn max_array_init(existing: JsonValue) -> JsonValue { + // The default value is an array, which can never be a value because arrays are always + // traversed. It's possible we never encounter a value due to only encountering empty + // arrays, in which case the final value is an empty array meaning no values encountered. + let mut val = JsonValue::Array(vec![]); + AggregateFun::max_array(&mut val, existing, &JsonValue::Null); + val + } + + fn max_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec { + AggregateFun::max_array(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(_) = existing { + *existing = new; + } else if (*existing).cmp(&new) == Ordering::Less { + *existing = new; + } + } + } + + fn min_array_init(existing: JsonValue) -> JsonValue { + // The default value is an array, which can never be a value because arrays are always + // traversed. It's possible we never encounter a value due to only encountering empty + // arrays, in which case the final value is an empty array meaning no values encountered. 
+ let mut val = JsonValue::Array(vec![]); + AggregateFun::min_array(&mut val, existing, &JsonValue::Null); + val + } + + fn min_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec { + AggregateFun::min_array(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(_) = existing { + *existing = new; + } else if (*existing).cmp(&new) == Ordering::Greater { + *existing = new; + } + } + } + + fn array_init(existing: JsonValue) -> JsonValue { + JsonValue::Array(vec![existing]) + } + + fn array(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let &mut JsonValue::Array(ref mut existing) = existing { + existing.push(new); + } + } + + fn array_flat_init(existing: JsonValue) -> JsonValue { + let mut new = JsonValue::Array(vec![]); + AggregateFun::array_flat(&mut new, existing, &JsonValue::Null); + new + } + + fn array_flat(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Array(vec) = new { + for v in vec.into_iter() { + AggregateFun::array_flat(existing, v, _user_arg); + } + } else { + if let &mut JsonValue::Array(ref mut existing) = existing { + existing.push(new); + } + } + } + + fn concat_init(existing: JsonValue) -> JsonValue { + if let JsonValue::String(_) = existing { + existing + } else { + JsonValue::String(String::new()) + } + } + + fn concat(existing: &mut JsonValue, new: JsonValue, user_arg: &JsonValue) { + if let &mut JsonValue::String(ref mut existing) = existing { + if let JsonValue::String(new) = new { + if let &JsonValue::String(ref user_arg) = user_arg { + existing.push_str(&user_arg); + existing.push_str(&new); + } + } + } + } + + fn avg_init(existing: JsonValue) -> JsonValue { + if let JsonValue::Number(_) = existing { + JsonValue::Array(vec![existing, JsonValue::Number(1.0)]) + } else if let JsonValue::Array(_) = existing { + let mut avg = JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]); + AggregateFun::avg(&mut avg, existing, &JsonValue::Null); + avg + } else { + JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]) + } + } + + fn avg(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { + if let JsonValue::Number(new) = new { + if let &mut JsonValue::Array(ref mut array) = existing { + let mut avg = if let &JsonValue::Number(ref avg) = &array[0] { + *avg + } else { + // can't happen but compiler need this here + 1.0 + }; + + let mut count = if let &JsonValue::Number(ref count) = &array[1] { + *count + } else { + // can't happen but compiler need this here + 1.0 + }; + + avg = (avg * count + new) / (count + 1.0); + count += 1.0; + array[0] = JsonValue::Number(avg); + array[1] = JsonValue::Number(count); + } + } else if let JsonValue::Array(vec) = new { + for v in vec.into_iter() { + AggregateFun::avg(existing, v, _user_arg); + } + } + } + + fn avg_final(existing: &mut JsonValue) { + let json = if let &mut JsonValue::Array(ref mut array) = existing { + if let &JsonValue::Number(ref avg) = &array[0] { + if let &JsonValue::Number(ref count) = &array[1] { + if *count == 0.0 { + JsonValue::Null + } else { + JsonValue::Number(*avg) + } + } else { + // can't happen but compiler need this here + JsonValue::Null + } + } else { + // can't happen but compiler need this here + JsonValue::Null + } + } else { + // can't happen but compiler need this here + JsonValue::Null + }; + *existing = json + } + + fn count_init(_existing: JsonValue) -> JsonValue { + JsonValue::Number(1.0) + } + + fn 
count(existing: &mut JsonValue, _: JsonValue, _user_arg: &JsonValue) { + if let &mut JsonValue::Number(ref mut num) = existing { + *num += 1.0; + } + } +} + diff --git a/src/lib.rs b/src/lib.rs index 655e47e..b822c1f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ extern crate rocksdb; +mod aggregates; mod error; mod filters; mod json_shred; diff --git a/src/parser.rs b/src/parser.rs index 6020daf..6b56a05 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -9,7 +9,8 @@ use error::Error; use key_builder::KeyBuilder; use stems::Stems; use json_value::JsonValue; -use query::{Sort, AggregateFun, SortInfo, SortField}; +use query::{Sort, SortInfo, SortField}; +use aggregates::AggregateFun; use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore, ReturnPath}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, diff --git a/src/query.rs b/src/query.rs index 60b0258..9207768 100644 --- a/src/query.rs +++ b/src/query.rs @@ -12,6 +12,7 @@ use index::Index; use parser::Parser; use json_value::{JsonValue}; use filters::QueryRuntimeFilter; +use aggregates::AggregateFun; use returnable::{Returnable, RetValue, RetScore, RetHidden, ReturnPath}; // TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs @@ -645,290 +646,6 @@ impl<'a> Iterator for QueryResults<'a> { } } - -#[derive(PartialEq, Eq, Clone)] -pub enum AggregateFun { - GroupAsc, - GroupDesc, - Sum, - Max, - MaxArray, - Min, - MinArray, - Array, - ArrayFlat, - Concat, - Avg, - Count, -} - -struct AggregateFunImpls { - init: Option JsonValue>, - action: fn (&mut JsonValue, JsonValue, &JsonValue), - extract: Option, -} - -impl AggregateFun { - fn get_fun_impls(&self) -> AggregateFunImpls { - match self { - &AggregateFun::GroupAsc => panic!("cannot get aggregate fun for grouping!"), - &AggregateFun::GroupDesc => panic!("cannot get aggregate fun for grouping!"), - &AggregateFun::Sum => AggregateFunImpls{ - init: Some(AggregateFun::sum_init), - action: AggregateFun::sum, - extract: None, - }, - &AggregateFun::Max => AggregateFunImpls{ - init: None, - action: AggregateFun::max, - extract: None, - }, - &AggregateFun::Min => AggregateFunImpls{ - init: None, - action: AggregateFun::min, - extract: None, - }, - &AggregateFun::MaxArray => AggregateFunImpls{ - init: Some(AggregateFun::max_array_init), - action: AggregateFun::max_array, - extract: None, - }, - &AggregateFun::MinArray => AggregateFunImpls{ - init: Some(AggregateFun::min_array_init), - action: AggregateFun::min_array, - extract: None, - }, - &AggregateFun::Array => AggregateFunImpls{ - init: Some(AggregateFun::array_init), - action: AggregateFun::array, - extract: None, - }, - &AggregateFun::ArrayFlat => AggregateFunImpls{ - init: Some(AggregateFun::array_flat_init), - action: AggregateFun::array_flat, - extract: None, - }, - &AggregateFun::Concat => AggregateFunImpls{ - init: Some(AggregateFun::concat_init), - action: AggregateFun::concat, - extract: None, - }, - &AggregateFun::Avg => AggregateFunImpls{ - init: Some(AggregateFun::avg_init), - action: AggregateFun::avg, - extract: Some(AggregateFun::avg_final), - }, - &AggregateFun::Count => AggregateFunImpls{ - init: Some(AggregateFun::count_init), - action: AggregateFun::count, - extract: None, - }, - } - } - - fn sum_init(existing: JsonValue) -> JsonValue { - let mut base = JsonValue::Number(0.0); - AggregateFun::sum(&mut base, existing, &JsonValue::Null); - base - } - - fn sum(mut existing: &mut JsonValue, new: 
JsonValue, _user_arg: &JsonValue) { - match new { - JsonValue::Number(new) => { - if let &mut JsonValue::Number(ref mut existing) = existing { - *existing += new; - } - }, - JsonValue::Array(vec) => { - for v in vec { - AggregateFun::sum(existing, v, _user_arg); - } - }, - _ => (), - } - } - - fn max(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - if *existing < new { - *existing = new - } - } - - fn min(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - if *existing > new { - *existing = new - } - } - - fn max_array_init(existing: JsonValue) -> JsonValue { - // The default value is an array, which can never be a value because arrays are always - // traversed. It's possible we never encounter a value due to only encountering empty - // arrays, in which case the final value is an empty array meaning no values encountered. - let mut val = JsonValue::Array(vec![]); - AggregateFun::max_array(&mut val, existing, &JsonValue::Null); - val - } - - fn max_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - if let JsonValue::Array(vec) = new { - for v in vec { - AggregateFun::max_array(existing, v, _user_arg); - } - } else { - if let &mut JsonValue::Array(_) = existing { - *existing = new; - } else if (*existing).cmp(&new) == Ordering::Less { - *existing = new; - } - } - } - - fn min_array_init(existing: JsonValue) -> JsonValue { - // The default value is an array, which can never be a value because arrays are always - // traversed. It's possible we never encounter a value due to only encountering empty - // arrays, in which case the final value is an empty array meaning no values encountered. - let mut val = JsonValue::Array(vec![]); - AggregateFun::min_array(&mut val, existing, &JsonValue::Null); - val - } - - fn min_array(mut existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - if let JsonValue::Array(vec) = new { - for v in vec { - AggregateFun::min_array(existing, v, _user_arg); - } - } else { - if let &mut JsonValue::Array(_) = existing { - *existing = new; - } else if (*existing).cmp(&new) == Ordering::Greater { - *existing = new; - } - } - } - - fn array_init(existing: JsonValue) -> JsonValue { - JsonValue::Array(vec![existing]) - } - - fn array(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - if let &mut JsonValue::Array(ref mut existing) = existing { - existing.push(new); - } - } - - fn array_flat_init(existing: JsonValue) -> JsonValue { - let mut new = JsonValue::Array(vec![]); - AggregateFun::array_flat(&mut new, existing, &JsonValue::Null); - new - } - - fn array_flat(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - if let JsonValue::Array(vec) = new { - for v in vec.into_iter() { - AggregateFun::array_flat(existing, v, _user_arg); - } - } else { - if let &mut JsonValue::Array(ref mut existing) = existing { - existing.push(new); - } - } - } - - fn concat_init(existing: JsonValue) -> JsonValue { - if let JsonValue::String(_) = existing { - existing - } else { - JsonValue::String(String::new()) - } - } - - fn concat(existing: &mut JsonValue, new: JsonValue, user_arg: &JsonValue) { - if let &mut JsonValue::String(ref mut existing) = existing { - if let JsonValue::String(new) = new { - if let &JsonValue::String(ref user_arg) = user_arg { - existing.push_str(&user_arg); - existing.push_str(&new); - } - } - } - } - - fn avg_init(existing: JsonValue) -> JsonValue { - if let JsonValue::Number(_) = existing { - JsonValue::Array(vec![existing, JsonValue::Number(1.0)]) - } else 
if let JsonValue::Array(_) = existing { - let mut avg = JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]); - AggregateFun::avg(&mut avg, existing, &JsonValue::Null); - avg - } else { - JsonValue::Array(vec![JsonValue::Number(0.0), JsonValue::Number(0.0)]) - } - } - - fn avg(existing: &mut JsonValue, new: JsonValue, _user_arg: &JsonValue) { - if let JsonValue::Number(new) = new { - if let &mut JsonValue::Array(ref mut array) = existing { - let mut avg = if let &JsonValue::Number(ref avg) = &array[0] { - *avg - } else { - // can't happen but compiler need this here - 1.0 - }; - - let mut count = if let &JsonValue::Number(ref count) = &array[1] { - *count - } else { - // can't happen but compiler need this here - 1.0 - }; - - avg = (avg * count + new) / (count + 1.0); - count += 1.0; - array[0] = JsonValue::Number(avg); - array[1] = JsonValue::Number(count); - } - } else if let JsonValue::Array(vec) = new { - for v in vec.into_iter() { - AggregateFun::avg(existing, v, _user_arg); - } - } - } - - fn avg_final(existing: &mut JsonValue) { - let json = if let &mut JsonValue::Array(ref mut array) = existing { - if let &JsonValue::Number(ref avg) = &array[0] { - if let &JsonValue::Number(ref count) = &array[1] { - if *count == 0.0 { - JsonValue::Null - } else { - JsonValue::Number(*avg) - } - } else { - // can't happen but compiler need this here - JsonValue::Null - } - } else { - // can't happen but compiler need this here - JsonValue::Null - } - } else { - // can't happen but compiler need this here - JsonValue::Null - }; - *existing = json - } - - fn count_init(_existing: JsonValue) -> JsonValue { - JsonValue::Number(1.0) - } - - fn count(existing: &mut JsonValue, _: JsonValue, _user_arg: &JsonValue) { - if let &mut JsonValue::Number(ref mut num) = existing { - *num += 1.0; - } - } -} - #[derive(PartialEq, Eq, Clone)] pub enum Sort { Asc, diff --git a/src/returnable.rs b/src/returnable.rs index b2d42e5..bffef2f 100644 --- a/src/returnable.rs +++ b/src/returnable.rs @@ -8,7 +8,8 @@ use std::iter::Iterator; use key_builder::{KeyBuilder, Segment}; use json_value::{JsonValue}; -use query::{AggregateFun, SortInfo}; +use query::SortInfo; +use aggregates::AggregateFun; use rocksdb::{self, DBIterator, IteratorMode}; From edffaf4e77853e83ee20431925745bf95e76a89a Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 23 Feb 2017 20:04:36 -0800 Subject: [PATCH 074/122] Move all rocksdb specific code into snapshot.rs So that the rocksdb code is isolated into the fewest files possible, all code that needs to interact is now in index.rs, json_shred.rs and new file snapshot.rs. One exception is ExactMatchFilter, which will eventually use the new R-Tree api. 
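
As a standalone sketch (not code from this patch) of the isolation pattern this
commit describes: the rest of the crate depends on one crate-local Snapshot
facade, and only that module names the storage engine's types. The Backend
type below is an in-memory stand-in for the RocksDB snapshot, purely for
illustration:

    use std::collections::BTreeMap;

    // Stand-in for the storage engine; in the real code this is the
    // RocksDB snapshot, and only snapshot.rs would name its types.
    struct Backend {
        data: BTreeMap<Vec<u8>, Vec<u8>>,
    }

    // The facade the rest of the crate depends on.
    pub struct Snapshot {
        backend: Backend,
    }

    impl Snapshot {
        fn new(backend: Backend) -> Snapshot {
            Snapshot { backend: backend }
        }

        // Purpose-built accessors keep engine types out of caller signatures.
        fn get(&self, key: &[u8]) -> Option<Vec<u8>> {
            self.backend.data.get(key).cloned()
        }
    }

    fn main() {
        let mut data = BTreeMap::new();
        data.insert(b"V1#._id".to_vec(), b"sdoc1".to_vec());
        let snapshot = Snapshot::new(Backend { data: data });
        assert!(snapshot.get(b"V1#._id").is_some());
    }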
--- src/filters.rs | 98 +----------- src/index.rs | 13 +- src/json_shred.rs | 4 +- src/lib.rs | 2 +- src/parser.rs | 15 +- src/query.rs | 15 +- src/returnable.rs | 248 +++-------------------------- src/snapshot.rs | 389 ++++++++++++++++++++++++++++++++++++++++++++++ src/term_index.rs | 64 -------- 9 files changed, 438 insertions(+), 410 deletions(-) create mode 100644 src/snapshot.rs delete mode 100644 src/term_index.rs diff --git a/src/filters.rs b/src/filters.rs index 07625b1..db9e370 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -2,94 +2,13 @@ use std::str; use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::HashSet; -use index::Index; -use std::f32; use error::Error; use key_builder::KeyBuilder; use query::{DocResult, QueryScoringInfo}; -use returnable::RetValue; use json_value::JsonValue; -use term_index::{DocResultIterator}; - -// TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{self, DBIterator, Snapshot, IteratorMode}; - -struct Scorer { - iter: DBIterator, - idf: f32, - boost: f32, - kb: KeyBuilder, - word: String, - term_ordinal: usize, -} - -impl Scorer { - fn new(iter: DBIterator, word: &str, kb: &KeyBuilder, boost: f32) -> Scorer { - Scorer { - iter: iter, - idf: f32::NAN, - boost: boost, - kb: kb.clone(), - word: word.to_string(), - term_ordinal: 0, - } - } - - fn init(&mut self, qsi: &mut QueryScoringInfo) { - let key = self.kb.keypathword_count_key(&self.word); - let doc_freq = if let Some(bytes) = self.get_value(&key) { - Index::convert_bytes_to_i32(bytes.as_ref()) as f32 - } else { - 0.0 - }; - - let key = self.kb.keypath_count_key(); - let num_docs = if let Some(bytes) = self.get_value(&key) { - Index::convert_bytes_to_i32(bytes.as_ref()) as f32 - } else { - 0.0 - }; - - self.idf = 1.0 + (num_docs/(doc_freq + 1.0)).ln(); - self.term_ordinal = qsi.num_terms; - qsi.num_terms += 1; - qsi.sum_of_idt_sqs += self.idf * self.idf; - } - - fn get_value(&mut self, key: &str) -> Option> { - self.iter.set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); - if let Some((ret_key, ret_value)) = self.iter.next() { - if ret_key.len() == key.len() && ret_key.starts_with(key.as_bytes()) { - Some(ret_value) - } else { - None - } - } else { - None - } - } - - fn add_match_score(&mut self, num_matches: u32, dr: &mut DocResult) { - if self.should_score() { - let key = self.kb.field_length_key_from_doc_result(dr); - let total_field_words = if let Some(bytes) = self.get_value(&key) { - Index::convert_bytes_to_i32(bytes.as_ref()) as f32 - } else { - panic!("Couldn't find field length for a match!! 
WHAT!"); - }; - - let tf: f32 = (num_matches as f32).sqrt(); - let norm = 1.0/(total_field_words as f32).sqrt(); - let score = self.idf * self.idf * tf * norm * self.boost; - dr.add_score(self.term_ordinal, score); - } - } - - fn should_score(&self) -> bool { - !self.idf.is_nan() - } -} +use snapshot::{Snapshot, DocResultIterator, Scorer, JsonFetcher}; +use rocksdb::{self, DBIterator, IteratorMode}; pub trait QueryRuntimeFilter { fn first_result(&mut self, start: &DocResult) -> Option; @@ -113,8 +32,8 @@ impl StemmedWordFilter { pub fn new(snapshot: &Snapshot, stemmed_word: &str, kb: &KeyBuilder, boost: f32) -> StemmedWordFilter { StemmedWordFilter { - iter: DocResultIterator::new(snapshot, stemmed_word, kb), - scorer: Scorer::new(snapshot.iterator(IteratorMode::Start), stemmed_word, kb, boost), + iter: snapshot.new_term_doc_result_iterator(stemmed_word, kb), + scorer: snapshot.new_scorer(stemmed_word, kb, boost), } } } @@ -161,9 +80,8 @@ impl StemmedWordPosFilter { pub fn new(snapshot: &Snapshot, stemmed_word: &str, kb: &KeyBuilder, boost: f32) -> StemmedWordPosFilter { StemmedWordPosFilter{ - iter: DocResultIterator::new(snapshot, stemmed_word, kb), - scorer: Scorer::new(snapshot.iterator(IteratorMode::Start), - &stemmed_word, &kb, boost), + iter: snapshot.new_term_doc_result_iterator(stemmed_word, kb), + scorer: snapshot.new_scorer(&stemmed_word, &kb, boost), } } @@ -307,7 +225,7 @@ impl ExactMatchFilter { pub fn new(snapshot: &Snapshot, filter: StemmedPhraseFilter, kb: KeyBuilder, phrase: String, case_sensitive: bool) -> ExactMatchFilter { ExactMatchFilter { - iter: snapshot.iterator(IteratorMode::Start), + iter: snapshot.new_iterator(), filter: filter, kb: kb, phrase: if case_sensitive {phrase} else {phrase.to_lowercase()}, @@ -324,7 +242,7 @@ impl ExactMatchFilter { if let Some((key, value)) = self.iter.next() { debug_assert!(key.starts_with(value_key.as_bytes())); // must always be true! 
- if let JsonValue::String(string) = RetValue::bytes_to_json_value(&*value) { + if let JsonValue::String(string) = JsonFetcher::bytes_to_json_value(&*value) { let matches = if self.case_sensitive { self.phrase == string } else { diff --git a/src/index.rs b/src/index.rs index 96410af..c49790e 100644 --- a/src/index.rs +++ b/src/index.rs @@ -12,11 +12,12 @@ use std::cmp::Ordering; use self::varint::{VarintRead, VarintWrite}; -use rocksdb::{MergeOperands, IteratorMode, CompactionDecision}; +use rocksdb::{MergeOperands, IteratorMode, Snapshot as RocksSnapshot, CompactionDecision}; use error::Error; use json_shred::{Shredder}; use key_builder::KeyBuilder; +use snapshot::Snapshot; const NOISE_HEADER_VERSION: u64 = 1; @@ -91,6 +92,10 @@ impl Index { self.rocks.is_some() } + pub fn new_snapshot(&self) -> Snapshot { + Snapshot::new(RocksSnapshot::new(self.rocks.as_ref().unwrap())) + } + //This deletes the Rockdbs instance from disk pub fn drop(name: &str) -> Result<(), Error> { let ret = try!(rocksdb::DB::destroy(&rocksdb::Options::default(), name)); @@ -309,7 +314,7 @@ mod tests { use super::{Index, OpenOptions}; use query::Query; use std::str; - use returnable::RetValue; + use snapshot::JsonFetcher; use json_value::JsonValue; #[test] @@ -392,7 +397,7 @@ mod tests { for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { if key[0] as char == 'V' { let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); - results.push((key_string, RetValue::bytes_to_json_value(&*value))); + results.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); } } @@ -416,7 +421,7 @@ mod tests { for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { if key[0] as char == 'V' { let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); - results.push((key_string, RetValue::bytes_to_json_value(&*value))); + results.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); } } let expected = vec![ diff --git a/src/json_shred.rs b/src/json_shred.rs index ce47b3d..a26e45b 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -352,8 +352,8 @@ mod tests { use std::str; use index::{Index, OpenOptions}; - use returnable::RetValue; use json_value::JsonValue; + use snapshot::JsonFetcher; fn positions_from_rocks(rocks: &rocksdb::DB) -> Vec<(String, Vec)> { let mut result = Vec::new(); @@ -378,7 +378,7 @@ mod tests { for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { if key[0] as char == 'V' { let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); - result.push((key_string, RetValue::bytes_to_json_value(&*value))); + result.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); } } result diff --git a/src/lib.rs b/src/lib.rs index b822c1f..f477d26 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,8 +6,8 @@ mod filters; mod json_shred; mod key_builder; mod parser; +mod snapshot; mod stems; -mod term_index; mod returnable; pub mod repl; pub mod json_value; diff --git a/src/parser.rs b/src/parser.rs index 6b56a05..7bcdccf 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -16,10 +16,7 @@ use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter, NotFilter}; - - -// TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::Snapshot; +use snapshot::Snapshot; pub struct 
Parser<'a, 'c> { @@ -1083,8 +1080,6 @@ mod tests { use super::Parser; use index::{Index, OpenOptions}; - - use rocksdb::Snapshot; #[test] fn test_whitespace() { @@ -1093,15 +1088,14 @@ mod tests { let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - let rocks = &index.rocks.unwrap(); - let mut snapshot = Snapshot::new(rocks); + let mut snapshot = index.new_snapshot(); let query = " \n \t test"; let mut parser = Parser::new(query, snapshot); parser.ws(); assert_eq!(parser.offset, 5); - snapshot = Snapshot::new(rocks); + snapshot = index.new_snapshot(); let query = "test".to_string(); let mut parser = Parser::new(&query, snapshot); parser.ws(); @@ -1115,8 +1109,7 @@ mod tests { let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - let rocks = &index.rocks.unwrap(); - let snapshot = Snapshot::new(rocks); + let snapshot = index.new_snapshot(); let query = r#"" \n \t test""#.to_string(); let mut parser = Parser::new(&query, snapshot); diff --git a/src/query.rs b/src/query.rs index 9207768..8e6cf8b 100644 --- a/src/query.rs +++ b/src/query.rs @@ -14,9 +14,8 @@ use json_value::{JsonValue}; use filters::QueryRuntimeFilter; use aggregates::AggregateFun; use returnable::{Returnable, RetValue, RetScore, RetHidden, ReturnPath}; +use snapshot::{Snapshot, JsonFetcher}; -// TODO vmx 2016-11-02: Make it import "rocksdb" properly instead of needing to import the individual tihngs -use rocksdb::{DBIterator, IteratorMode, Snapshot}; #[derive(Clone)] @@ -164,7 +163,7 @@ impl Query { return Err(Error::Parse("You must open the index first".to_string())); } - let snapshot = Snapshot::new(&index.rocks.as_ref().unwrap()); + let snapshot = index.new_snapshot(); let mut parser = Parser::new(query, snapshot); let mut filter = try!(parser.build_filter()); let mut sorts = try!(parser.sort_clause()); @@ -299,7 +298,7 @@ impl Query { Ok(QueryResults { filter: filter, doc_result_next: DocResult::new(), - iter: parser.snapshot.iterator(IteratorMode::Start), + fetcher: parser.snapshot.new_json_fetcher(), snapshot: parser.snapshot, returnable: returnable, needs_sorting_and_ags: needs_sorting_and_ags, @@ -323,7 +322,7 @@ pub struct QueryResults<'a> { filter: Box, doc_result_next: DocResult, snapshot: Snapshot<'a>, - iter: DBIterator, + fetcher: JsonFetcher, returnable: Box, needs_sorting_and_ags: bool, done_with_sorting_and_ags: bool, @@ -384,7 +383,7 @@ impl<'a> QueryResults<'a> { match seq { Some(seq) => { let key = format!("V{}#._id", seq); - match self.snapshot.get(&key.as_bytes()).unwrap() { + match self.snapshot.get(&key.as_bytes()) { // If there is an id, it's UTF-8. 
Strip off type leading byte Some(id) => Some(id.to_utf8().unwrap()[1..].to_string()), None => None @@ -406,7 +405,7 @@ impl<'a> QueryResults<'a> { Some(dr) => { let score = self.compute_relevancy_score(&dr); let mut results = VecDeque::new(); - self.returnable.fetch_result(&mut self.iter, dr.seq, score, + self.returnable.fetch_result(&mut self.fetcher, dr.seq, score, &dr.bind_name_result, &mut results); self.in_buffer.push(results); if self.in_buffer.len() == self.limit { @@ -441,7 +440,7 @@ impl<'a> QueryResults<'a> { }; let score = self.compute_relevancy_score(&dr); let mut results = VecDeque::new(); - self.returnable.fetch_result(&mut self.iter, dr.seq, score, + self.returnable.fetch_result(&mut self.fetcher, dr.seq, score, &dr.bind_name_result, &mut results); Some(self.returnable.json_result(&mut results)) } diff --git a/src/returnable.rs b/src/returnable.rs index bffef2f..528f9b3 100644 --- a/src/returnable.rs +++ b/src/returnable.rs @@ -1,18 +1,14 @@ use std::str; use std::collections::HashMap; -use std::iter::Peekable; -use std::mem::transmute; use std::collections::VecDeque; -use std::iter::Iterator; -use key_builder::{KeyBuilder, Segment}; -use json_value::{JsonValue}; +use key_builder::KeyBuilder; +use json_value::JsonValue; use query::SortInfo; +use snapshot::JsonFetcher; use aggregates::AggregateFun; -use rocksdb::{self, DBIterator, IteratorMode}; - #[derive(Clone)] pub enum PathSegment { ObjectKey(String), @@ -68,7 +64,7 @@ impl ReturnPath { key } - fn nth(&self, i: usize) -> Option<&PathSegment> { + pub fn nth(&self, i: usize) -> Option<&PathSegment> { if self.path.len() <= i { None } else { @@ -86,7 +82,7 @@ pub trait Returnable { /// and then each nested Returnable will fetch information about the document (fields or /// scores or bind variables etc) and convert them to JsonValues and add them to the result /// VecDeque. 
- fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque); @@ -118,11 +114,11 @@ pub struct RetObject { } impl Returnable for RetObject { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { for &(ref _key, ref field) in self.fields.iter() { - field.fetch_result(iter, seq, score, bind_var_keys, result); + field.fetch_result(fetcher, seq, score, bind_var_keys, result); } } @@ -159,11 +155,11 @@ pub struct RetArray { } impl Returnable for RetArray { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { for ref slot in self.slots.iter() { - slot.fetch_result(iter, seq, score, bind_var_keys, result); + slot.fetch_result(fetcher, seq, score, bind_var_keys, result); } } @@ -202,14 +198,14 @@ pub struct RetHidden { } impl Returnable for RetHidden { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, score: f32, + fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { for ref unrendered in self.unrendered.iter() { - unrendered.fetch_result(iter, seq, score, bind_var_keys, result); + unrendered.fetch_result(fetcher, seq, score, bind_var_keys, result); } - self.visible.fetch_result(iter, seq, score, bind_var_keys, result); + self.visible.fetch_result(fetcher, seq, score, bind_var_keys, result); } fn get_aggregate_funs(&self, funs: &mut Vec>) { @@ -244,7 +240,7 @@ pub struct RetLiteral { } impl Returnable for RetLiteral { - fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, _score: f32, + fn fetch_result(&self, _fetcher: &mut JsonFetcher, _seq: u64, _score: f32, _bind_var_keys: &HashMap>, _result: &mut VecDeque) { } @@ -275,224 +271,18 @@ pub struct RetValue { pub sort_info: Option, } -impl RetValue { - pub fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { - match bytes[0] as char { - 's' => { - let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); - JsonValue::String(string) - }, - 'f' => { - assert!(bytes.len() == 9); - let mut bytes2: [u8; 8] = [0; 8]; - for (n, b) in bytes[1..9].iter().enumerate() { - bytes2[n] = *b; - } - let double: f64 = unsafe{transmute(bytes2)}; - JsonValue::Number(double) - }, - 'T' => JsonValue::True, - 'F' => JsonValue::False, - 'N' => JsonValue::Null, - 'o' => JsonValue::Object(vec![]), - 'a' => JsonValue::Array(vec![]), - what => panic!("unexpected type tag in value: {}", what), - } - } - - fn return_array(mut array: Vec<(u64, JsonValue)>) -> JsonValue { - array.sort_by_key(|tuple| tuple.0); - JsonValue::Array(array.into_iter() - .map(|(_i, json)| json) - .collect()) - } - - fn descend_return_path(iter: &mut DBIterator, seq: u64, kb: &mut KeyBuilder, - rp: &ReturnPath, mut rp_index: usize) -> Option { - - while let Some(segment) = rp.nth(rp_index) { - rp_index += 1; - match segment { - &PathSegment::ObjectKey(ref string) => { - kb.push_object_key(string); - }, - &PathSegment::ArrayAll => { - let mut i = 0; - let mut vec = Vec::new(); - loop { - kb.push_array_index(i); - i += 1; - if let Some(json) = RetValue::descend_return_path(iter, seq, - &mut kb.clone(), rp, rp_index) { - vec.push(json); - kb.pop_array(); - } else { - // we didn't get a value, is it because 
the array ends or the - // full path isn't there? check as there might be more array elements - // with a full path that does match. - let value_key = kb.value_key(seq); - kb.pop_array(); - - // Seek in index to >= entry - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - - if let Some((key, _value)) = iter.next() { - if key.starts_with(value_key.as_bytes()) { - // yes it exists. loop again. - continue; - } - } - - if vec.is_empty() { - return None; - } else { - return Some(JsonValue::Array(vec)); - } - } - } - }, - &PathSegment::Array(ref index) => { - kb.push_array_index(*index); - } - } - } - - let value_key = kb.value_key(seq); - - // Seek in index to >= entry - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - - let (key, value) = match iter.next() { - Some((key, value)) => (key, value), - None => { - return None - }, - }; - - if !key.starts_with(value_key.as_bytes()) { - return None - } - - let json_value = RetValue::fetch(&mut iter.peekable(), &value_key, key, value); - Some(json_value) - } - fn fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, - mut key: Box<[u8]>, mut value: Box<[u8]>) -> JsonValue { - - if key.len() == value_key.len() { - // we have a key match! - return RetValue::bytes_to_json_value(value.as_ref()); - } - let segment = { - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - let remaining = &key_str[value_key.len()..]; - KeyBuilder::parse_first_key_value_segment(&remaining) - }; - - match segment { - Some((Segment::ObjectKey(mut unescaped), escaped)) => { - let mut object: Vec<(String, JsonValue)> = Vec::new(); - - let mut value_key_next = value_key.to_string() + &escaped; - loop { - let json_val = RetValue::fetch(iter, &value_key_next, key, value); - object.push((unescaped, json_val)); - - let segment = match iter.peek() { - Some(&(ref k, ref _v)) => { - if !k.starts_with(value_key.as_bytes()) { - return JsonValue::Object(object); - } - - let key_str = unsafe{str::from_utf8_unchecked(&k)}; - let remaining = &key_str[value_key.len()..]; - - KeyBuilder::parse_first_key_value_segment(&remaining) - }, - None => return JsonValue::Object(object), - }; - - if let Some((Segment::ObjectKey(unescaped2), escaped2)) = segment { - unescaped = unescaped2; - // advance the peeked iter - match iter.next() { - Some((k, v)) => { - key = k; - value = v; - } - None => panic!("couldn't advanced already peeked iter"), - }; - value_key_next.truncate(value_key.len()); - value_key_next.push_str(&escaped2); - } else { - return JsonValue::Object(object); - } - } - } - Some((Segment::Array(mut i), escaped)) => { - // we use a tuple with ordinal because we encounter - // elements in lexical sorting order instead of ordinal order - let mut array: Vec<(u64, JsonValue)> = Vec::new(); - - let mut value_key_next = value_key.to_string() + &escaped; - loop { - let json_val = RetValue::fetch(iter, &value_key_next, key, value); - array.push((i, json_val)); - - let segment = match iter.peek() { - Some(&(ref k, ref _v)) => { - if !k.starts_with(value_key.as_bytes()) { - return RetValue::return_array(array); - } - - let key_str = unsafe{str::from_utf8_unchecked(&k)}; - let remaining = &key_str[value_key.len()..]; - - KeyBuilder::parse_first_key_value_segment(&remaining) - }, - None => return RetValue::return_array(array), - }; - - if let Some((Segment::Array(i2), escaped2)) = segment { - i = i2; - // advance the already peeked iter - match iter.next() { - Some((k, v)) => { - key = k; - value = v; - }, - None 
=> panic!("couldn't advanced already peeked iter"), - }; - value_key_next.truncate(value_key.len()); - value_key_next.push_str(&escaped2); - } else { - return RetValue::return_array(array); - } - } - }, - None => { - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - panic!("somehow couldn't parse key segment {} {}", value_key, key_str); - }, - } - } -} impl Returnable for RetValue { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, + fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, _score: f32, _bind_var_keys: &HashMap>, result: &mut VecDeque) { if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { //don't fetch anything for count(). just stick in a null result.push_back(JsonValue::Null); } - let mut kb = KeyBuilder::new(); - - if let Some(json) = RetValue::descend_return_path(iter, seq, &mut kb, &self.rp, 0) { + if let Some(json) = fetcher.fetch(seq, &mut kb, &self.rp) { result.push_back(json); } else { result.push_back(self.default.clone()); @@ -532,7 +322,7 @@ pub struct RetBind { } impl Returnable for RetBind { - fn fetch_result(&self, iter: &mut DBIterator, seq: u64, _score: f32, + fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, _score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { @@ -540,11 +330,9 @@ impl Returnable for RetBind { let mut array = Vec::with_capacity(value_keys.len()); for base_key in value_keys { let mut kb = KeyBuilder::new(); - kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&base_key)); - if let Some(json) = RetValue::descend_return_path(iter, seq, &mut kb, - &self.extra_rp, 0) { + if let Some(json) = fetcher.fetch(seq, &mut kb, &self.extra_rp) { array.push(json); } else { array.push(self.default.clone()); @@ -583,7 +371,7 @@ pub struct RetScore { } impl Returnable for RetScore { - fn fetch_result(&self, _iter: &mut DBIterator, _seq: u64, score: f32, + fn fetch_result(&self, _fetcher: &mut JsonFetcher, _seq: u64, score: f32, _bind_var_keys: &HashMap>, result: &mut VecDeque) { result.push_back(JsonValue::Number(score as f64)); diff --git a/src/snapshot.rs b/src/snapshot.rs new file mode 100644 index 0000000..56ccf22 --- /dev/null +++ b/src/snapshot.rs @@ -0,0 +1,389 @@ +use rocksdb::{self, DBIterator, Snapshot as RocksSnapshot, IteratorMode}; + +extern crate varint; + +use std::io::Cursor; +use std::str; +use std::mem::transmute; +use std::iter::Peekable; +use std::f32; + +use key_builder::{KeyBuilder, Segment}; +use query::{DocResult, QueryScoringInfo}; +use index::Index; +use returnable::{PathSegment, ReturnPath}; +use json_value::JsonValue; +use self::varint::VarintRead; + + +pub struct Snapshot<'a> { + rocks: RocksSnapshot<'a>, +} + +impl<'a> Snapshot<'a> { + pub fn new(rocks: RocksSnapshot) -> Snapshot { + Snapshot{rocks: rocks} + } + + pub fn new_term_doc_result_iterator(&self, term: &str, kb: &KeyBuilder) -> DocResultIterator { + DocResultIterator { + iter: self.rocks.iterator(IteratorMode::Start), + keypathword: kb.get_keypathword_only(&term), + } + + } + + pub fn get(&self, key: &[u8]) -> Option { + self.rocks.get(key).unwrap() + } + + pub fn new_scorer(&self, term: &str, kb: &KeyBuilder, boost: f32) -> Scorer { + Scorer { + iter: self.rocks.iterator(IteratorMode::Start), + idf: f32::NAN, + boost: boost, + kb: kb.clone(), + term: term.to_string(), + term_ordinal: 0, + } + } + + pub fn new_json_fetcher(&self) -> JsonFetcher { + JsonFetcher { + iter: self.rocks.iterator(IteratorMode::Start), + } + } + + pub fn new_iterator(&self) -> DBIterator { + 
self.rocks.iterator(IteratorMode::Start)
+    }
+}
+
+pub struct DocResultIterator {
+    iter: DBIterator,
+    keypathword: String,
+}
+
+impl DocResultIterator {
+
+    pub fn advance_gte(&mut self, start: &DocResult) {
+        KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start);
+        // Seek in index to >= entry
+        self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(),
+                                              rocksdb::Direction::Forward));
+        KeyBuilder::truncate_to_keypathword(&mut self.keypathword);
+    }
+
+    pub fn next(&mut self) -> Option<(DocResult, TermPositions)> {
+        if let Some((key, value)) = self.iter.next() {
+            if !key.starts_with(self.keypathword.as_bytes()) {
+                // we passed the key path we are interested in. nothing left to do
+                return None
+            }
+
+            let key_str = unsafe{str::from_utf8_unchecked(&key)};
+            let dr = KeyBuilder::parse_doc_result_from_key(&key_str);
+
+            Some((dr, TermPositions{pos: value.into_vec()}))
+        } else {
+            None
+        }
+    }
+}
+
+
+pub struct TermPositions {
+    pos: Vec<u8>,
+}
+
+impl TermPositions {
+    pub fn positions(self) -> Vec<u32> {
+        let mut bytes = Cursor::new(self.pos);
+        let mut positions = Vec::new();
+        while let Ok(pos) = bytes.read_unsigned_varint_32() {
+            positions.push(pos);
+        }
+        positions
+    }
+}
+
+pub struct Scorer {
+    iter: DBIterator,
+    idf: f32,
+    boost: f32,
+    kb: KeyBuilder,
+    term: String,
+    term_ordinal: usize,
+}
+
+impl Scorer {
+
+    pub fn init(&mut self, qsi: &mut QueryScoringInfo) {
+        let key = self.kb.keypathword_count_key(&self.term);
+        let doc_freq = if let Some(bytes) = self.get_value(&key) {
+            Index::convert_bytes_to_i32(bytes.as_ref()) as f32
+        } else {
+            0.0
+        };
+
+        let key = self.kb.keypath_count_key();
+        let num_docs = if let Some(bytes) = self.get_value(&key) {
+            Index::convert_bytes_to_i32(bytes.as_ref()) as f32
+        } else {
+            0.0
+        };
+
+        self.idf = 1.0 + (num_docs/(doc_freq + 1.0)).ln();
+        self.term_ordinal = qsi.num_terms;
+        qsi.num_terms += 1;
+        qsi.sum_of_idt_sqs += self.idf * self.idf;
+    }
+
+    pub fn get_value(&mut self, key: &str) -> Option<Box<[u8]>> {
+        self.iter.set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward));
+        if let Some((ret_key, ret_value)) = self.iter.next() {
+            if ret_key.len() == key.len() && ret_key.starts_with(key.as_bytes()) {
+                Some(ret_value)
+            } else {
+                None
+            }
+        } else {
+            None
+        }
+    }
+
+    pub fn add_match_score(&mut self, num_matches: u32, dr: &mut DocResult) {
+        if self.should_score() {
+            let key = self.kb.field_length_key_from_doc_result(dr);
+            let total_field_words = if let Some(bytes) = self.get_value(&key) {
+                Index::convert_bytes_to_i32(bytes.as_ref()) as f32
+            } else {
+                panic!("Couldn't find field length for a match!! 
WHAT!"); + }; + + let tf: f32 = (num_matches as f32).sqrt(); + let norm = 1.0/(total_field_words as f32).sqrt(); + let score = self.idf * self.idf * tf * norm * self.boost; + dr.add_score(self.term_ordinal, score); + } + } + + pub fn should_score(&self) -> bool { + !self.idf.is_nan() + } +} + + +pub struct JsonFetcher { + iter: DBIterator, +} + +impl JsonFetcher { + + pub fn fetch(&mut self, seq: u64, mut kb_base: &mut KeyBuilder, rp: &ReturnPath) -> Option { + JsonFetcher::descend_return_path(&mut self.iter, seq, &mut kb_base, &rp, 0) + } + + pub fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { + match bytes[0] as char { + 's' => { + let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); + JsonValue::String(string) + }, + 'f' => { + assert!(bytes.len() == 9); + let mut bytes2: [u8; 8] = [0; 8]; + for (n, b) in bytes[1..9].iter().enumerate() { + bytes2[n] = *b; + } + let double: f64 = unsafe{transmute(bytes2)}; + JsonValue::Number(double) + }, + 'T' => JsonValue::True, + 'F' => JsonValue::False, + 'N' => JsonValue::Null, + 'o' => JsonValue::Object(vec![]), + 'a' => JsonValue::Array(vec![]), + what => panic!("unexpected type tag in value: {}", what), + } + } + + fn return_array(mut array: Vec<(u64, JsonValue)>) -> JsonValue { + array.sort_by_key(|tuple| tuple.0); + JsonValue::Array(array.into_iter() + .map(|(_i, json)| json) + .collect()) + } + + fn descend_return_path(iter: &mut DBIterator, seq: u64, kb: &mut KeyBuilder, + rp: &ReturnPath, mut rp_index: usize) -> Option { + + while let Some(segment) = rp.nth(rp_index) { + rp_index += 1; + match segment { + &PathSegment::ObjectKey(ref string) => { + kb.push_object_key(string); + }, + &PathSegment::ArrayAll => { + let mut i = 0; + let mut vec = Vec::new(); + loop { + kb.push_array_index(i); + i += 1; + if let Some(json) = JsonFetcher::descend_return_path(iter, seq, + &mut kb.clone(), rp, rp_index) { + vec.push(json); + kb.pop_array(); + } else { + // we didn't get a value, is it because the array ends or the + // full path isn't there? check as there might be more array elements + // with a full path that does match. + let value_key = kb.value_key(seq); + kb.pop_array(); + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + if let Some((key, _value)) = iter.next() { + if key.starts_with(value_key.as_bytes()) { + // yes it exists. loop again. + continue; + } + } + + if vec.is_empty() { + return None; + } else { + return Some(JsonValue::Array(vec)); + } + } + } + }, + &PathSegment::Array(ref index) => { + kb.push_array_index(*index); + } + } + } + + let value_key = kb.value_key(seq); + + // Seek in index to >= entry + iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + + let (key, value) = match iter.next() { + Some((key, value)) => (key, value), + None => { + return None + }, + }; + + if !key.starts_with(value_key.as_bytes()) { + return None + } + Some(JsonFetcher::do_fetch(&mut iter.peekable(), &value_key, key, value)) + } + + fn do_fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, + mut key: Box<[u8]>, mut value: Box<[u8]>) -> JsonValue { + + if key.len() == value_key.len() { + // we have a key match! 
+            return JsonFetcher::bytes_to_json_value(value.as_ref());
+        }
+        let segment = {
+            let key_str = unsafe{str::from_utf8_unchecked(&key)};
+            let remaining = &key_str[value_key.len()..];
+            KeyBuilder::parse_first_key_value_segment(&remaining)
+        };
+
+        match segment {
+            Some((Segment::ObjectKey(mut unescaped), escaped)) => {
+                let mut object: Vec<(String, JsonValue)> = Vec::new();
+
+                let mut value_key_next = value_key.to_string() + &escaped;
+                loop {
+                    let json_val = JsonFetcher::do_fetch(iter, &value_key_next, key, value);
+                    object.push((unescaped, json_val));
+
+                    let segment = match iter.peek() {
+                        Some(&(ref k, ref _v)) => {
+                            if !k.starts_with(value_key.as_bytes()) {
+                                return JsonValue::Object(object);
+                            }
+
+                            let key_str = unsafe{str::from_utf8_unchecked(&k)};
+                            let remaining = &key_str[value_key.len()..];
+
+                            KeyBuilder::parse_first_key_value_segment(&remaining)
+                        },
+                        None => return JsonValue::Object(object),
+                    };
+
+                    if let Some((Segment::ObjectKey(unescaped2), escaped2)) = segment {
+                        unescaped = unescaped2;
+                        // advance the peeked iter
+                        match iter.next() {
+                            Some((k, v)) => {
+                                key = k;
+                                value = v;
+                            }
+                            None => panic!("couldn't advance the already peeked iter"),
+                        };
+                        value_key_next.truncate(value_key.len());
+                        value_key_next.push_str(&escaped2);
+                    } else {
+                        return JsonValue::Object(object);
+                    }
+                }
+            }
+            Some((Segment::Array(mut i), escaped)) => {
+                // we use a tuple with ordinal because we encounter
+                // elements in lexical sorting order instead of ordinal order
+                let mut array: Vec<(u64, JsonValue)> = Vec::new();
+
+                let mut value_key_next = value_key.to_string() + &escaped;
+                loop {
+                    let json_val = JsonFetcher::do_fetch(iter, &value_key_next, key, value);
+                    array.push((i, json_val));
+
+                    let segment = match iter.peek() {
+                        Some(&(ref k, ref _v)) => {
+                            if !k.starts_with(value_key.as_bytes()) {
+                                return JsonFetcher::return_array(array);
+                            }
+
+                            let key_str = unsafe{str::from_utf8_unchecked(&k)};
+                            let remaining = &key_str[value_key.len()..];
+
+                            KeyBuilder::parse_first_key_value_segment(&remaining)
+                        },
+                        None => return JsonFetcher::return_array(array),
+                    };
+
+                    if let Some((Segment::Array(i2), escaped2)) = segment {
+                        i = i2;
+                        // advance the already peeked iter
+                        match iter.next() {
+                            Some((k, v)) => {
+                                key = k;
+                                value = v;
+                            },
+                            None => panic!("couldn't advance the already peeked iter"),
+                        };
+                        value_key_next.truncate(value_key.len());
+                        value_key_next.push_str(&escaped2);
+                    } else {
+                        return JsonFetcher::return_array(array);
+                    }
+                }
+            },
+            None => {
+                let key_str = unsafe{str::from_utf8_unchecked(&key)};
+                panic!("somehow couldn't parse key segment {} {}", value_key, key_str);
+            },
+        }
+    }
+}
+
diff --git a/src/term_index.rs b/src/term_index.rs
deleted file mode 100644
index 8e6e8de..0000000
--- a/src/term_index.rs
+++ /dev/null
@@ -1,64 +0,0 @@
-extern crate varint;
-
-use std::io::Cursor;
-use std::str;
-
-use key_builder::KeyBuilder;
-use query::DocResult;
-
-use rocksdb::{self, DBIterator, Snapshot, IteratorMode};
-use self::varint::VarintRead;
-
-pub struct DocResultIterator {
-    iter: DBIterator,
-    keypathword: String,
-}
-
-impl DocResultIterator {
-    pub fn new(snapshot: &Snapshot, word: &str, kb: &KeyBuilder) -> DocResultIterator {
-        DocResultIterator {
-            iter: snapshot.iterator(IteratorMode::Start),
-            keypathword: kb.get_keypathword_only(&word),
-        }
-    }
-
-    pub fn advance_gte(&mut self, start: &DocResult) {
-        KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start);
-        // Seek in index to >= entry
-        self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(),
-                                              rocksdb::Direction::Forward));
-        KeyBuilder::truncate_to_keypathword(&mut self.keypathword);
-    }
-
-    pub fn next(&mut self) -> Option<(DocResult, TermPositions)> {
-        if let Some((key, value)) = self.iter.next() {
-            if !key.starts_with(self.keypathword.as_bytes()) {
-                // we passed the key path we are interested in. nothing left to do */
-                return None
-            }
-
-            let key_str = unsafe{str::from_utf8_unchecked(&key)};
-            let dr = KeyBuilder::parse_doc_result_from_key(&key_str);
-
-            Some((dr, TermPositions{pos: value.into_vec()}))
-        } else {
-            None
-        }
-    }
-}
-
-
-pub struct TermPositions {
-    pos: Vec<u8>,
-}
-
-impl TermPositions {
-    pub fn positions(self) -> Vec<u32> {
-        let mut bytes = Cursor::new(self.pos);
-        let mut positions = Vec::new();
-        while let Ok(pos) = bytes.read_unsigned_varint_32() {
-            positions.push(pos);
-        }
-        positions
-    }
-}

From 67ac3dde33826c3c5d8d60223097b4bf17d68401 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Mon, 27 Feb 2017 22:29:28 -0800
Subject: [PATCH 075/122] Avoid recompile when running `cargo test`

When editing only test scripts in repo-tests/, cargo will by default
unnecessarily recompile with each invocation of `cargo test`. This change
eliminates the unnecessary recompile, making running the test scripts much
faster.

See documentation here for more information:
http://doc.crates.io/build-script.html
---
 build.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/build.rs b/build.rs
index 3033f62..46a4178 100644
--- a/build.rs
+++ b/build.rs
@@ -1,4 +1,5 @@
 fn main() {
-
+    println!("cargo:rerun-if-changed=src/");
+    println!("cargo:rerun-if-changed=tests/");
 }

From 49a027225d0e3fc6fd70050d1267a92f3f1fad87 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Mon, 27 Feb 2017 22:37:50 -0800
Subject: [PATCH 076/122] remove normalization of json keys

Normalizing json keys was a mistake. We want to preserve full fidelity of
keys (and original strings) in all circumstances. Removed the code and test.
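
As a quick standalone illustration of the fidelity problem (not part of the
patch): NFKC folds byte-distinct but canonically equivalent keys into one
normalized form, so two different raw JSON keys could collide in the index
and the original spelling could not be recovered. The composed/decomposed
"Å" pair below is a hypothetical example:

    fn main() {
        let composed = "\u{00C5}";           // "Å" as a single code point
        let decomposed = "\u{0041}\u{030A}"; // "A" plus combining ring above
        // The raw keys differ byte-for-byte, so they now index separately...
        assert_ne!(composed, decomposed);
        // ...whereas the removed nfkc() pass folded both to the same key.
    }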
---
 src/key_builder.rs | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/src/key_builder.rs b/src/key_builder.rs
index 8f0d34a..a993daa 100644
--- a/src/key_builder.rs
+++ b/src/key_builder.rs
@@ -4,8 +4,6 @@ use query::DocResult;
 use std::str;
 use std::cmp::Ordering;
 
-use self::unicode_normalization::UnicodeNormalization;
-
 pub enum Segment {
     ObjectKey(String),
     Array(u64),
@@ -257,8 +255,7 @@ impl KeyBuilder {
         let mut escaped_key = String::with_capacity((key.len() * 2) + 1); // max expansion
         escaped_key.push('.');
 
-        // normalize the key otherwise we might not match unnormalized but equivelent keys
-        for cc in key.nfkc() {
+        for cc in key.chars() {
             // Escape chars that conflict with delimiters
             if "\\$.!#".contains(cc) {
                 escaped_key.push('\\');
@@ -434,13 +431,6 @@ mod tests {
         assert_eq!(kb.keypath_segments_len(), 0, "No segments so far");
     }
 
-    #[test]
-    fn test_segments_canonical() {
-        let mut kb = KeyBuilder::new();
-        kb.push_object_key("\u{0041}\u{030A}");
-        assert_eq!(kb.stemmed_word_key("word", 1), "W.Å!word#1,");
-    }
-
     #[test]
     fn test_doc_result_parse() {
         let key = "W.foo$.bar$!word#123,1,0".to_string();

From 5589f9ac54a68cfb75c5d9df9bb032fcef0ef086 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Thu, 9 Mar 2017 17:27:08 -0800
Subject: [PATCH 077/122] Make a proper importable library Cargo crate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `crate-type = ["lib"]` to Cargo.toml to make a library crate.
---
 Cargo.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index 24c2e0a..3660d21 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,10 @@ readme = "README.md"
 description = "Nested Object Inverted Search Engine"
 build = "build.rs"
 
+[lib]
+name = "noise"
+crate-type = ["lib"]
+
 [dependencies]
 
 rustc-serialize = "0.3.19"

From 3947f601c0d6995d71b967e402015f0981801e13 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Thu, 9 Mar 2017 17:33:17 -0800
Subject: [PATCH 078/122] Add Damien as a project author
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 3660d21..bee3969 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "noise"
 version = "0.1.0"
-authors = ["Volker Mische "]
+authors = ["Damien Katz ", "Volker Mische "]
 repository = "https://github.com/pipedown/noise.git"
 homepage = "https://github.com/pipedown/noise.git"
 license = "MIT OR Apache-2.0"

From 67ba5f1bb67ef80d6b18d56ef216b2c121580d78 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Thu, 9 Mar 2017 17:48:36 -0800
Subject: [PATCH 079/122] Change name of crate and lib to noise_search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The name “noise” is already taken by another crate.
---
 Cargo.toml          | 4 ++--
 src/main.rs         | 4 ++--
 tests/repl_tests.rs | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index bee3969..2ba77aa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "noise"
+name = "noise_search"
 version = "0.1.0"
 authors = ["Damien Katz ", "Volker Mische "]
 repository = "https://github.com/pipedown/noise.git"
@@ -10,7 +10,7 @@ description = "Nested Object Inverted Search Engine"
 build = "build.rs"
 
 [lib]
-name = "noise"
+name = "noise_search"
 crate-type = ["lib"]

diff --git a/src/main.rs b/src/main.rs
index df4051d..45ae6e0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,6 @@
-extern crate noise;
+extern crate noise_search;
 
-use noise::repl::repl;
+use noise_search::repl::repl;
 
 use std::env;
 use std::io::{self, BufReader};

diff --git a/tests/repl_tests.rs b/tests/repl_tests.rs
index aa44b8a..8e4671b 100644
--- a/tests/repl_tests.rs
+++ b/tests/repl_tests.rs
@@ -1,10 +1,10 @@
-extern crate noise;
+extern crate noise_search;
 
 use std::io::{Read, Write, BufReader};
 use std::fs::{self, File};
 use std::env;
 
-use noise::repl::repl;
+use noise_search::repl::repl;
 
 #[test]
 fn test_repl() {

From f52f7efaaf523b84fced84887e18a5a932e6eaa6 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Fri, 31 Mar 2017 16:01:41 -0700
Subject: [PATCH 080/122] Remove WriteOptions from struct Index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It’s not marked thread safe, so we can’t use the same rocksdb instance in
multiple threads.
---
 src/index.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/index.rs b/src/index.rs
index c49790e..9323cb8 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -22,7 +22,6 @@ use snapshot::Snapshot;
 const NOISE_HEADER_VERSION: u64 = 1;
 
 pub struct Index {
-    write_options: rocksdb::WriteOptions,
     high_doc_seq: u64,
     pub rocks: Option<rocksdb::DB>,
     id_str_in_batch: HashSet<String>,
@@ -36,7 +35,6 @@ pub enum OpenOptions {
 impl Index {
     pub fn new() -> Index {
         Index {
-            write_options: rocksdb::WriteOptions::new(),
             high_doc_seq: 0,
             rocks: None,
             id_str_in_batch: HashSet::new(),
@@ -68,7 +66,7 @@ impl Index {
         let mut bytes = Vec::with_capacity(8*2);
         bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap();
         bytes.write(&Index::convert_u64_to_bytes(0)).unwrap();
-        try!(rocks.put_opt(b"HDB", &bytes, &self.write_options));
+        try!(rocks.put_opt(b"HDB", &bytes, &rocksdb::WriteOptions::new()));
 
         rocks
     }

From b817af9e40d9925745bbf4432ec00a40b3e01758 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Fri, 31 Mar 2017 16:54:23 -0700
Subject: [PATCH 081/122] Remove WriteBatch from Index

It also cannot be shared across threads. This breaks the tests, but we need
to run an experiment.
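
The eventual calling pattern (a dedicated Batch type arrives a few patches
later) is that the caller owns the non-thread-safe write batch while the
Index itself stays shareable. A minimal standalone sketch of that shape,
with simplified stand-in types rather than the real rocksdb-backed API:

    // Stand-ins for illustration only; the real batch wraps a
    // rocksdb::WriteBatch and flush() writes it into RocksDB.
    struct Batch {
        ops: Vec<String>,
    }

    struct Index;

    impl Index {
        // Mutations accumulate in a caller-owned batch, not inside Index.
        fn add(&self, json: &str, batch: &mut Batch) {
            batch.ops.push(json.to_string());
        }

        // flush() consumes the batch so it cannot be reused afterwards.
        fn flush(&self, batch: Batch) {
            for op in batch.ops {
                println!("writing: {}", op);
            }
        }
    }

    fn main() {
        let index = Index;
        let mut batch = Batch { ops: Vec::new() };
        index.add(r#"{"foo":"bar"}"#, &mut batch);
        index.flush(batch); // moved; create a fresh Batch for more writes
    }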
--- src/index.rs | 19 +++++++------------ src/repl.rs | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/index.rs b/src/index.rs index 9323cb8..0ba5893 100644 --- a/src/index.rs +++ b/src/index.rs @@ -25,7 +25,6 @@ pub struct Index { high_doc_seq: u64, pub rocks: Option, id_str_in_batch: HashSet, - batch: Option, } pub enum OpenOptions { @@ -38,7 +37,6 @@ impl Index { high_doc_seq: 0, rocks: None, id_str_in_batch: HashSet::new(), - batch: None, } } // NOTE vmx 2016-10-13: Perhpas the name should be specified on `new()` as it is bound @@ -81,8 +79,6 @@ impl Index { // next 8 is high seq self.high_doc_seq = Index::convert_bytes_to_u64(&value[8..]); - self.batch = Some(rocksdb::WriteBatch::default()); - Ok(()) } @@ -100,7 +96,7 @@ impl Index { Ok(ret) } - pub fn add(&mut self, json: &str) -> Result { + pub fn add(&mut self, json: &str, mut batch: &mut rocksdb::WriteBatch) -> Result { if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); } @@ -128,14 +124,14 @@ impl Index { (self.high_doc_seq, docid) }; // now everything needs to be added to the batch, - try!(shredder.add_all_to_batch(seq, &mut self.batch.as_mut().unwrap())); + try!(shredder.add_all_to_batch(seq, &mut batch)); self.id_str_in_batch.insert(docid.clone()); Ok(docid) } /// Returns Ok(true) if the document was found and deleted, Ok(false) if it could not be found - pub fn delete(&mut self, docid: &str) -> Result { + pub fn delete(&mut self, docid: &str, mut batch: &mut rocksdb::WriteBatch) -> Result { if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); } @@ -147,7 +143,7 @@ impl Index { if let Some((seq, key_values)) = try!(self.gather_doc_fields(docid)) { let mut shredder = Shredder::new(); try!(shredder.delete_existing_doc(docid, seq, key_values, - &mut self.batch.as_mut().unwrap())); + &mut batch)); Ok(true) } else { Ok(false) @@ -186,7 +182,7 @@ impl Index { } // Store the current batch - pub fn flush(&mut self) -> Result<(), Error> { + pub fn flush(&mut self, mut batch: rocksdb::WriteBatch) -> Result<(), Error> { // Flush can only be called if the index is open if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); @@ -196,12 +192,11 @@ impl Index { let mut bytes = Vec::with_capacity(8*2); bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap(); bytes.write(&Index::convert_u64_to_bytes(self.high_doc_seq)).unwrap(); - try!(self.batch.as_mut().unwrap().put(b"HDB", &bytes)); + try!(batch.put(b"HDB", &bytes)); - let status = try!(rocks.write(self.batch.take().unwrap())); + let status = try!(rocks.write(batch)); // Make sure there's a always a valid WriteBarch after writing it into RocksDB, // else calls to `self.batch.as_mut().unwrap()` would panic. 
- self.batch = Some(rocksdb::WriteBatch::default()); self.id_str_in_batch.clear(); Ok(status) } diff --git a/src/repl.rs b/src/repl.rs index 031fe9f..21be06a 100644 --- a/src/repl.rs +++ b/src/repl.rs @@ -3,6 +3,8 @@ use query::Query; use json_value::{JsonValue, PrettyPrint}; use std::io::{Write, BufRead}; +use std::mem; +use rocksdb; fn is_command(str: &str) -> bool { @@ -18,6 +20,7 @@ fn is_command(str: &str) -> bool { pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { let mut index = Index::new(); + let mut batch = rocksdb::WriteBatch::default(); let mut lines = String::new(); let mut pretty = PrettyPrint::new("", "", ""); loop { @@ -50,7 +53,9 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { } else { // commit anything written if index.is_open() { - if let Err(reason) = index.flush() { + let mut batch2 = rocksdb::WriteBatch::default(); + mem::swap(&mut batch, &mut batch2); + if let Err(reason) = index.flush(batch2) { write!(w, "{}\n", reason).unwrap(); } } @@ -104,22 +109,26 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { }, } } else if lines.starts_with("add") { - match index.add(&lines[3..]) { + match index.add(&lines[3..], &mut batch) { Ok(id) => write!(w, "{}\n", JsonValue::str_to_literal(&id)).unwrap(), Err(reason) => write!(w, "{}\n", reason).unwrap(), } } else if lines.starts_with("del") { - match index.delete(&lines[3..].trim_left()) { + match index.delete(&lines[3..].trim_left(), &mut batch) { Ok(true) => write!(w, "ok\n").unwrap(), Ok(false) => write!(w, "not found\n").unwrap(), Err(reason) => write!(w, "{}\n", reason).unwrap(), } } else if lines.starts_with("commit") { - if let Err(reason) = index.flush() { + let mut batch2 = rocksdb::WriteBatch::default(); + mem::swap(&mut batch, &mut batch2); + if let Err(reason) = index.flush(batch2) { write!(w, "{}\n", reason).unwrap(); } } else if lines.starts_with("find") { - if let Err(reason) = index.flush() { + let mut batch2 = rocksdb::WriteBatch::default(); + mem::swap(&mut batch, &mut batch2); + if let Err(reason) = index.flush(batch2) { write!(w, "{}\n", reason).unwrap(); } else { match Query::get_matches(&lines, &index) { From d21eea748c8c45736b99cdc70c5eff2b797ed5d9 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 31 Mar 2017 17:33:27 -0700 Subject: [PATCH 082/122] Export rocksdb::batch for use by clients --- src/index.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index.rs b/src/index.rs index 0ba5893..d4069c2 100644 --- a/src/index.rs +++ b/src/index.rs @@ -13,6 +13,7 @@ use std::cmp::Ordering; use self::varint::{VarintRead, VarintWrite}; use rocksdb::{MergeOperands, IteratorMode, Snapshot as RocksSnapshot, CompactionDecision}; +pub use rocksdb::batch; use error::Error; use json_shred::{Shredder}; From 743442016adb3d8f30342c87edbfd89d87fa9db1 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Fri, 31 Mar 2017 17:35:30 -0700 Subject: [PATCH 083/122] Fix batch to WriteBatch --- src/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index d4069c2..31fccce 100644 --- a/src/index.rs +++ b/src/index.rs @@ -13,7 +13,7 @@ use std::cmp::Ordering; use self::varint::{VarintRead, VarintWrite}; use rocksdb::{MergeOperands, IteratorMode, Snapshot as RocksSnapshot, CompactionDecision}; -pub use rocksdb::batch; +pub use rocksdb::WriteBatch; use error::Error; use json_shred::{Shredder}; From 0ac8e5b4c54fe3fbda179b88f8373dd764077c0f Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sun, 2 Apr 2017 14:39:31 -0700 Subject: [PATCH 
084/122] New Batch type Moved rocksdb::WriteBatch into new Batch type. Fixed all the tests. --- src/index.rs | 67 ++++++++++++++++++++++++++++++++-------------------- src/query.rs | 14 ++++++----- src/repl.rs | 11 ++++----- 3 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/index.rs b/src/index.rs index 31fccce..10a0cda 100644 --- a/src/index.rs +++ b/src/index.rs @@ -25,9 +25,22 @@ const NOISE_HEADER_VERSION: u64 = 1; pub struct Index { high_doc_seq: u64, pub rocks: Option, +} + +pub struct Batch { + wb: rocksdb::WriteBatch, id_str_in_batch: HashSet, } +impl Batch { + pub fn new() -> Batch { + Batch { + wb: rocksdb::WriteBatch::default(), + id_str_in_batch: HashSet::new(), + } + } +} + pub enum OpenOptions { Create } @@ -37,7 +50,6 @@ impl Index { Index { high_doc_seq: 0, rocks: None, - id_str_in_batch: HashSet::new(), } } // NOTE vmx 2016-10-13: Perhpas the name should be specified on `new()` as it is bound @@ -97,14 +109,14 @@ impl Index { Ok(ret) } - pub fn add(&mut self, json: &str, mut batch: &mut rocksdb::WriteBatch) -> Result { + pub fn add(&mut self, json: &str, mut batch: &mut Batch) -> Result { if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); } let mut shredder = Shredder::new(); let (seq, docid) = if let Some(docid) = try!(shredder.shred(json)) { // user supplied doc id, see if we have an existing one. - if self.id_str_in_batch.contains(&docid) { + if batch.id_str_in_batch.contains(&docid) { // oops use trying to add some doc 2x to this batch. return Err(Error::Write("Attempt to insert multiple docs with same _id" .to_string())); @@ -125,18 +137,18 @@ impl Index { (self.high_doc_seq, docid) }; // now everything needs to be added to the batch, - try!(shredder.add_all_to_batch(seq, &mut batch)); - self.id_str_in_batch.insert(docid.clone()); + try!(shredder.add_all_to_batch(seq, &mut batch.wb)); + batch.id_str_in_batch.insert(docid.clone()); Ok(docid) } /// Returns Ok(true) if the document was found and deleted, Ok(false) if it could not be found - pub fn delete(&mut self, docid: &str, mut batch: &mut rocksdb::WriteBatch) -> Result { + pub fn delete(&mut self, docid: &str, mut batch: &mut Batch) -> Result { if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); } - if self.id_str_in_batch.contains(docid) { + if batch.id_str_in_batch.contains(docid) { // oops use trying to delete a doc that's in the batch. 
Can't happen, return Err(Error::Write("Attempt to delete doc with same _id added earlier" .to_string())); @@ -144,7 +156,8 @@ impl Index { if let Some((seq, key_values)) = try!(self.gather_doc_fields(docid)) { let mut shredder = Shredder::new(); try!(shredder.delete_existing_doc(docid, seq, key_values, - &mut batch)); + &mut batch.wb)); + batch.id_str_in_batch.insert(docid.to_string()); Ok(true) } else { Ok(false) @@ -183,7 +196,7 @@ impl Index { } // Store the current batch - pub fn flush(&mut self, mut batch: rocksdb::WriteBatch) -> Result<(), Error> { + pub fn flush(&mut self, mut batch: Batch) -> Result<(), Error> { // Flush can only be called if the index is open if !self.is_open() { return Err(Error::Write("Index isn't open.".to_string())); @@ -193,12 +206,9 @@ impl Index { let mut bytes = Vec::with_capacity(8*2); bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap(); bytes.write(&Index::convert_u64_to_bytes(self.high_doc_seq)).unwrap(); - try!(batch.put(b"HDB", &bytes)); + try!(batch.wb.put(b"HDB", &bytes)); - let status = try!(rocks.write(batch)); - // Make sure there's a always a valid WriteBarch after writing it into RocksDB, - // else calls to `self.batch.as_mut().unwrap()` would panic. - self.id_str_in_batch.clear(); + let status = try!(rocks.write(batch.wb)); Ok(status) } @@ -305,7 +315,7 @@ impl Index { #[cfg(test)] mod tests { extern crate rocksdb; - use super::{Index, OpenOptions}; + use super::{Index, OpenOptions, Batch}; use query::Query; use std::str; use snapshot::JsonFetcher; @@ -319,20 +329,21 @@ mod tests { let mut index = Index::new(); //let db = super::Index::open("firstnoisedb", Option::None).unwrap(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - index.flush().unwrap(); + index.flush(Batch::new()).unwrap(); } #[test] fn test_uuid() { let dbname = "target/tests/testuuid"; let _ = Index::drop(dbname); + let mut batch = Batch::new(); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - let id = index.add(r#"{"foo":"bar"}"#).unwrap(); + let id = index.add(r#"{"foo":"bar"}"#, &mut batch).unwrap(); - index.flush().unwrap(); + index.flush(batch).unwrap(); let mut results = Query::get_matches(r#"find {foo:=="bar"}"#, &index).unwrap(); let query_id = results.get_next_id().unwrap(); @@ -344,15 +355,17 @@ mod tests { fn test_compaction() { let dbname = "target/tests/testcompaction"; let _ = Index::drop(dbname); + let mut batch = Batch::new(); let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - let id = index.add(r#"{"foo":"bar"}"#).unwrap(); - index.flush().unwrap(); + let id = index.add(r#"{"foo":"bar"}"#, &mut batch).unwrap(); + index.flush(batch).unwrap(); - index.delete(&id).unwrap(); - index.flush().unwrap(); + let mut batch = Batch::new(); + index.delete(&id, &mut batch).unwrap(); + index.flush(batch).unwrap(); let rocks = index.rocks.as_mut().unwrap(); @@ -381,9 +394,10 @@ mod tests { //index.flush().unwrap(); - let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}"#).unwrap(); + let mut batch = Batch::new(); + let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}"#, &mut batch).unwrap(); - index.flush().unwrap(); + index.flush(batch).unwrap(); { let rocks = index.rocks.as_mut().unwrap(); @@ -406,8 +420,9 @@ mod tests { assert_eq!(results, expected); } - let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": []}"#).unwrap(); - index.flush().unwrap(); + let mut batch = Batch::new(); + let _ = index.add(r#"{"_id":"1", "foo":"array", 
"baz": []}"#, &mut batch).unwrap(); + index.flush(batch).unwrap(); let rocks = index.rocks.as_mut().unwrap(); diff --git a/src/query.rs b/src/query.rs index 8e6cf8b..8331711 100644 --- a/src/query.rs +++ b/src/query.rs @@ -673,7 +673,7 @@ mod tests { use super::Query; - use index::{Index, OpenOptions}; + use index::{Index, OpenOptions, Batch}; #[test] fn test_query_hello_world() { @@ -682,8 +682,10 @@ mod tests { let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#); - index.flush().unwrap(); + + let mut batch = Batch::new(); + let _ = index.add(r#"{"_id": "foo", "hello": "world"}"#, &mut batch); + index.flush(batch).unwrap(); let mut query_results = Query::get_matches(r#"find {hello:=="world"}"#, &index).unwrap(); //let mut query_results = Query::get_matches(r#"a.b[foo="bar"]"#.to_string(), &index).unwrap(); @@ -697,12 +699,12 @@ mod tests { let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - + let mut batch = Batch::new(); for ii in 1..100 { let data = ((ii % 25) + 97) as u8 as char; - let _ = index.add(&format!(r#"{{"_id":"{}", "data": "{}"}}"#, ii, data)); + let _ = index.add(&format!(r#"{{"_id":"{}", "data": "{}"}}"#, ii, data), &mut batch); } - index.flush().unwrap(); + index.flush(batch).unwrap(); let mut query_results = Query::get_matches(r#"find {data: == "u"}"#, &index).unwrap(); loop { diff --git a/src/repl.rs b/src/repl.rs index 21be06a..51d3aa9 100644 --- a/src/repl.rs +++ b/src/repl.rs @@ -1,10 +1,9 @@ -use index::{Index, OpenOptions}; +use index::{Index, OpenOptions, Batch}; use query::Query; use json_value::{JsonValue, PrettyPrint}; use std::io::{Write, BufRead}; use std::mem; -use rocksdb; fn is_command(str: &str) -> bool { @@ -20,7 +19,7 @@ fn is_command(str: &str) -> bool { pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { let mut index = Index::new(); - let mut batch = rocksdb::WriteBatch::default(); + let mut batch = Batch::new(); let mut lines = String::new(); let mut pretty = PrettyPrint::new("", "", ""); loop { @@ -53,7 +52,7 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { } else { // commit anything written if index.is_open() { - let mut batch2 = rocksdb::WriteBatch::default(); + let mut batch2 = Batch::new(); mem::swap(&mut batch, &mut batch2); if let Err(reason) = index.flush(batch2) { write!(w, "{}\n", reason).unwrap(); @@ -120,13 +119,13 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { Err(reason) => write!(w, "{}\n", reason).unwrap(), } } else if lines.starts_with("commit") { - let mut batch2 = rocksdb::WriteBatch::default(); + let mut batch2 = Batch::new(); mem::swap(&mut batch, &mut batch2); if let Err(reason) = index.flush(batch2) { write!(w, "{}\n", reason).unwrap(); } } else if lines.starts_with("find") { - let mut batch2 = rocksdb::WriteBatch::default(); + let mut batch2 = Batch::new(); mem::swap(&mut batch, &mut batch2); if let Err(reason) = index.flush(batch2) { write!(w, "{}\n", reason).unwrap(); From 4000f771067fc85e30dea03e2a45decb3d6b2efd Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sun, 2 Apr 2017 18:29:46 -0700 Subject: [PATCH 085/122] Fix broken test --- repl-tests/deletion_updates.noise | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repl-tests/deletion_updates.noise b/repl-tests/deletion_updates.noise index 5f2f9c2..94a3a78 100644 --- a/repl-tests/deletion_updates.noise +++ b/repl-tests/deletion_updates.noise @@ -32,7 +32,7 @@ del 5; ok add 
{"_id":"5", "A":"word"}; -"5" +Write error: Attempt to insert multiple docs with same _id # add again without committing add {"_id":"5", "A":"word"}; From e6850e6d17585443886334097ee9885bd0b667ba Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Sun, 2 Apr 2017 18:30:12 -0700 Subject: [PATCH 086/122] Add support for getting filename --- src/index.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index 10a0cda..9c8d847 100644 --- a/src/index.rs +++ b/src/index.rs @@ -23,6 +23,7 @@ use snapshot::Snapshot; const NOISE_HEADER_VERSION: u64 = 1; pub struct Index { + name: String, high_doc_seq: u64, pub rocks: Option, } @@ -48,6 +49,7 @@ pub enum OpenOptions { impl Index { pub fn new() -> Index { Index { + name: String::new(), high_doc_seq: 0, rocks: None, } @@ -91,7 +93,7 @@ impl Index { assert_eq!(Index::convert_bytes_to_u64(&value[..8]), NOISE_HEADER_VERSION); // next 8 is high seq self.high_doc_seq = Index::convert_bytes_to_u64(&value[8..]); - + self.name = name.to_string(); Ok(()) } @@ -99,6 +101,10 @@ impl Index { self.rocks.is_some() } + pub fn get_name(&self) -> &str { + &self.name + } + pub fn new_snapshot(&self) -> Snapshot { Snapshot::new(RocksSnapshot::new(self.rocks.as_ref().unwrap())) } From 188e3f9df0888debc990c236c0d64dce7f8a227c Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 4 Apr 2017 00:24:26 -0700 Subject: [PATCH 087/122] Fix panic that occurs when Parser::must_consume runs out of string --- src/parser.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/parser.rs b/src/parser.rs index 7bcdccf..c55c3a1 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -64,9 +64,14 @@ impl<'a, 'c> Parser<'a, 'c> { self.ws(); Ok(()) } else { - Err(Error::Parse(format!("Expected '{}' at character {}, found {}.", + if self.offset == self.query.len() { + Err(Error::Parse(format!("Expected '{}' at character {} but query string ended.", + token, self.offset))) + } else { + Err(Error::Parse(format!("Expected '{}' at character {}, found {}.", token, self.offset, &self.query[self.offset..self.offset+1]))) + } } } @@ -1115,4 +1120,18 @@ mod tests { let mut parser = Parser::new(&query, snapshot); assert_eq!(parser.must_consume_string_literal().unwrap(), " \n \t test".to_string()); } + + #[test] + fn test_bad_query_syntax() { + let dbname = "target/tests/test_bad_query_syntax"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + let snapshot = index.new_snapshot(); + + let query = r#"find {foo: =="bar""#.to_string(); + let mut parser = Parser::new(&query, snapshot); + assert!(parser.find().is_err()); + } } \ No newline at end of file From aee2c020e36fb088eaf400e1d283d65b2a11295c Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Fri, 7 Apr 2017 00:02:46 +0200 Subject: [PATCH 088/122] Make code clearer The code was using `std::mem::swap()` to work around the borrow-checker. Without this call it's clearer what the code actually does. 
--- src/json_shred.rs | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index a26e45b..4f0114f 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -8,7 +8,6 @@ use std::io::Write; use std::str::Chars; use std::io::Cursor; use std::str; -use std::mem; use self::varint::VarintWrite; use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; @@ -51,7 +50,7 @@ impl Shredder { } } - fn add_entries(&mut self, text: &str, docseq: u64, + fn add_entries(kb: &mut KeyBuilder, text: &str, docseq: u64, batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { let stems = Stems::new(text); let mut word_to_word_positions = HashMap::new(); @@ -73,32 +72,32 @@ impl Shredder { } for (stemmed, (word_positions, count)) in word_to_word_positions { - let key = self.kb.stemmed_word_key(&stemmed, docseq); + let key = kb.stemmed_word_key(&stemmed, docseq); if delete { try!(batch.delete(&key.into_bytes())); } else { try!(batch.put(&key.into_bytes(), &word_positions.into_inner())); } - let key = self.kb.field_length_key(docseq); + let key = kb.field_length_key(docseq); if delete { try!(batch.delete(&key.into_bytes())); } else { try!(batch.put(&key.into_bytes(), &Index::convert_i32_to_bytes(total_words))); } - let key = self.kb.keypathword_count_key(&stemmed); + let key = kb.keypathword_count_key(&stemmed); if delete { try!(batch.merge(&key.into_bytes(), &Index::convert_i32_to_bytes(-count))); } else { try!(batch.merge(&key.into_bytes(), &Index::convert_i32_to_bytes(count))); } - let key = self.kb.keypath_count_key(); + let key = kb.keypath_count_key(); try!(batch.merge(&key.into_bytes(), one_enc_bytes.get_ref())); } - let key = self.kb.value_key(docseq); + let key = kb.value_key(docseq); if delete { try!(batch.delete(&key.into_bytes())); } else { @@ -184,32 +183,32 @@ impl Shredder { } pub fn add_all_to_batch(&mut self, seq: u64, - batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { - let mut key_values = BTreeMap::new(); - mem::swap(&mut key_values, &mut self.existing_key_value_to_delete); - for (key, value) in key_values.into_iter() { + batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + for (key, value) in &self.existing_key_value_to_delete { self.kb.clear(); self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); if value[0] as char == 's' { let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(self.add_entries(text, seq, batch, true)); + try!(Shredder::add_entries(&mut self.kb, text, seq, batch, true)); } else { - try!(batch.delete(&key.into_bytes())); + try!(batch.delete(key.as_bytes())); } } - let mut key_values = BTreeMap::new(); - mem::swap(&mut key_values, &mut self.shredded_key_values); - for (key, value) in key_values.into_iter() { + self.existing_key_value_to_delete = BTreeMap::new(); + + for (key, value) in &self.shredded_key_values { self.kb.clear(); self.kb.parse_value_key_path_only(&key); if value[0] as char == 's' { let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(self.add_entries(text, seq, batch, false)); + try!(Shredder::add_entries(&mut self.kb, text, seq, batch, false)); } else { let key = self.kb.value_key(seq); - try!(batch.put(&key.into_bytes(), &value.as_ref())); + try!(batch.put(&key.as_bytes(), &value.as_ref())); } } + self.shredded_key_values = BTreeMap::new(); + let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); try!(batch.put(&key.into_bytes(), &seq.to_string().as_bytes())); @@ -227,7 
+226,7 @@ impl Shredder { self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); if value[0] as char == 's' { let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(self.add_entries(text, seq, batch, true)); + try!(Shredder::add_entries(&mut self.kb, text, seq, batch, true)); } else { try!(batch.delete(&key.into_bytes())); } From 4c7f3322dcbadbfdcc1f5549d7b0721993c5f71e Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 4 Apr 2017 14:24:53 +0200 Subject: [PATCH 089/122] Make comments doc comments --- src/key_builder.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/key_builder.rs b/src/key_builder.rs index a993daa..6ec7365 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -305,8 +305,8 @@ impl KeyBuilder { self.keypath.len() } - /* splits key into key path, seq and array path - ex "W.foo$.bar$.baz!word#123,0,0" -> ("W.foo$.bar$.bar!word", "123", "0,0") */ + /// splits key into key path, seq and array path + /// ex "W.foo$.bar$.baz!word#123,0,0" -> ("W.foo$.bar$.baz!word", "123", "0,0") fn split_keypath_seq_arraypath_from_key(str: &str) -> (&str, &str, &str) { let n = str.rfind("#").unwrap(); assert!(n != 0); @@ -317,7 +317,7 @@ impl KeyBuilder { (&str[..n], &seq_arraypath_str[..m], &seq_arraypath_str[m + 1..]) } - /* parses a seq and array path portion (ex "123,0,0,10) of a key into a doc result */ + /// parses a seq and array path portion (ex "123,0,0,10") of a key into a doc result pub fn parse_doc_result_from_key(str: &str) -> DocResult { let mut dr = DocResult::new(); let (_path_str, seq_str, arraypath_str) = From aee89d2da88974c0ba7fcce8ebb108eb040e770f Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 4 Apr 2017 14:28:22 +0200 Subject: [PATCH 090/122] Refactor json_primitive function --- src/parser.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index c55c3a1..d8ae6b7 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1000,9 +1000,18 @@ impl<'a, 'c> Parser<'a, 'c> { Ok(Some(try!(self.json_object()))) } else if self.could_consume("[") { Ok(Some(try!(self.json_array()))) - } else if let Some(string) = try!(self.consume_string_literal()) { - Ok(Some(JsonValue::String(string))) } else { + Ok(try!(self.json_primitive())) + } + } + + /// JSON primitives are strings, numbers, booleans and null + fn json_primitive(&mut self) -> Result, Error> { + if let Some(string) = try!(self.consume_string_literal()) { + Ok(Some(JsonValue::String(string))) + } + // The else is needed because of https://github.com/rust-lang/rust/issues/37510 + else { if self.consume("true") { Ok(Some(JsonValue::True)) } else if self.consume("false") { From cca2cfb07be6c97e3bf8a866ce8cfb49b548d2ea Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Fri, 7 Apr 2017 00:32:54 +0200 Subject: [PATCH 091/122] Rename `add_entries` to `add_stemmed_entries()` for clarity --- src/json_shred.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 4f0114f..4761174 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -50,7 +50,7 @@ impl Shredder {
self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); if value[0] as char == 's' { let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(Shredder::add_entries(&mut self.kb, text, seq, batch, true)); + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); } else { try!(batch.delete(key.as_bytes())); } @@ -201,7 +201,7 @@ impl Shredder { self.kb.parse_value_key_path_only(&key); if value[0] as char == 's' { let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(Shredder::add_entries(&mut self.kb, text, seq, batch, false)); + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, false)); } else { let key = self.kb.value_key(seq); try!(batch.put(&key.as_bytes(), &value.as_ref())); @@ -226,7 +226,7 @@ impl Shredder { self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); if value[0] as char == 's' { let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(Shredder::add_entries(&mut self.kb, text, seq, batch, true)); + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); } else { try!(batch.delete(&key.into_bytes())); } From d6f3b2910eff337dfa57901e0b4c6b22179e5a2b Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Thu, 13 Apr 2017 15:16:33 +0200 Subject: [PATCH 092/122] Exact match for numbers (without scoring) This is a first version of exact match queries for numbers. It is the basis for range queries. --- repl-tests/ranges.noise | 33 +++++++++++++++++ src/filters.rs | 79 ++++++++++++++++++++++++++++++++++++++++- src/json_shred.rs | 72 ++++++++++++++++++++++++++++--------- src/key_builder.rs | 25 +++++++++++++ src/parser.rs | 45 ++++++++++++++++------- 5 files changed, 224 insertions(+), 30 deletions(-) create mode 100644 repl-tests/ranges.noise diff --git a/repl-tests/ranges.noise b/repl-tests/ranges.noise new file mode 100644 index 0000000..212429f --- /dev/null +++ b/repl-tests/ranges.noise @@ -0,0 +1,33 @@ +# Test for less and greater than + +drop target/tests/querytestranges; +create target/tests/querytestranges; + + +add {"_id":"one", "A":12}; +"one" +add {"_id":"two", "A":12}; +"two" +add {"_id":"three", "numberarray": [30, 60, 90]}; +"three" + + +# Exact match number + +find {A: ==12}; +[ +"one", +"two" +] + +find {numberarray: [==60]}; +[ +"three" +] + +del one; +ok +find {A: ==12}; +[ +"two" +] diff --git a/src/filters.rs b/src/filters.rs index db9e370..b38aa1b 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,4 +1,4 @@ -use std::str; +use std::{mem, str}; use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::HashSet; @@ -298,6 +298,83 @@ impl QueryRuntimeFilter for ExactMatchFilter { } } +pub struct RangeFilter { + iter: DBIterator, + kb: KeyBuilder, + min: Option, + max: Option, + keypath: String, +} + +impl RangeFilter { + pub fn new(snapshot: &Snapshot, kb: KeyBuilder, min: Option, max: Option) -> RangeFilter { + RangeFilter { + iter: snapshot.new_iterator(), + kb: kb, + min: min, + max: max, + // The keypath we use to seek to the correct key within RocksDB + keypath: String::new(), + } + } +} + +impl QueryRuntimeFilter for RangeFilter { + fn first_result(&mut self, start: &DocResult) -> Option { + let mut value_key = self.kb.number_key_without_arraypath(start.seq); + + // NOTE vmx 2017-04-13: Iterating over keys is really similar to the + // `DocResultIterator` in `snapshot.rs`. It should probablly be unified. 
+ self.iter.set_mode(IteratorMode::From(value_key.as_bytes(), + rocksdb::Direction::Forward)); + KeyBuilder::truncate_to_keypathword(&mut value_key); + self.keypath = value_key; + self.next_result() + } + + fn next_result(&mut self) -> Option { + while let Some((key, value)) = self.iter.next() { + if !key.starts_with(self.keypath.as_bytes()) { + // we passed the key path we are interested in. nothing left to do + return None + } + + let key_str = unsafe{ str::from_utf8_unchecked(&key) }; + let number = unsafe{ + let array = *(value[..].as_ptr() as *const [_; 8]); + mem::transmute::<[u8; 8], f64>(array) + }; + + match (self.min, self.max) { + (Some(min), Some(max)) => { + if number >= min && number <= max { + let dr = KeyBuilder::parse_doc_result_from_key(&key_str); + return Some(dr); + } + // Keep looping and move on to the next key + }, + _ => { + panic!("Only closed ranges are supported atm"); + } + } + } + None + } + + // TODO vmx 2017-04-13: Scoring is not implemented yet + fn prepare_relevancy_scoring(&mut self, _qsi: &mut QueryScoringInfo) { + } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } +} + + pub struct DistanceFilter { filters: Vec, current_filter: usize, diff --git a/src/json_shred.rs b/src/json_shred.rs index 4761174..3061aa6 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -50,6 +50,28 @@ impl Shredder { } } + fn add_number_entries(kb: &mut KeyBuilder, number: &[u8], docseq: u64, + batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { + // Add/delete the key that is used for range lookups + let number_key = kb.number_key(docseq); + if delete { + try!(batch.delete(&number_key.as_bytes())); + } else { + // The number contains the `f` prefix + try!(batch.put(&number_key.as_bytes(), &number[1..])); + } + + // Add/elete the key-value pair of the shredded original JSON + let value_key = kb.value_key(docseq); + if delete { + try!(batch.delete(&value_key.as_bytes())); + } else { + try!(batch.put(&value_key.into_bytes(), &number.as_ref())); + } + + Ok(()) + } + fn add_stemmed_entries(kb: &mut KeyBuilder, text: &str, docseq: u64, batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { let stems = Stems::new(text); @@ -187,11 +209,17 @@ impl Shredder { for (key, value) in &self.existing_key_value_to_delete { self.kb.clear(); self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); - if value[0] as char == 's' { - let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); - } else { - try!(batch.delete(key.as_bytes())); + match value[0] as char { + 's' => { + let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); + }, + 'f' => { + try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); + }, + _ => { + try!(batch.delete(key.as_bytes())); + }, } } self.existing_key_value_to_delete = BTreeMap::new(); @@ -199,12 +227,18 @@ impl Shredder { for (key, value) in &self.shredded_key_values { self.kb.clear(); self.kb.parse_value_key_path_only(&key); - if value[0] as char == 's' { - let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, false)); - } else { - let key = self.kb.value_key(seq); - try!(batch.put(&key.as_bytes(), &value.as_ref())); + match value[0] as char { + 's' => { + let text = 
unsafe{ str::from_utf8_unchecked(&value[1..]) }; + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, false)); + }, + 'f' => { + try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, false)); + }, + _ => { + let key = self.kb.value_key(seq); + try!(batch.put(&key.as_bytes(), &value.as_ref())); + }, } } self.shredded_key_values = BTreeMap::new(); @@ -224,11 +258,17 @@ impl Shredder { for (key, value) in existing.into_iter() { self.kb.clear(); self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); - if value[0] as char == 's' { - let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; - try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); - } else { - try!(batch.delete(&key.into_bytes())); + match value[0] as char { + 's' => { + let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); + }, + 'f' => { + try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); + }, + _ => { + try!(batch.delete(&key.as_bytes())); + }, } } let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); diff --git a/src/key_builder.rs b/src/key_builder.rs index 6ec7365..1389fa0 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -68,6 +68,27 @@ impl KeyBuilder { str } + // Build the keypath for a number primitive without the arraypath + pub fn number_key_without_arraypath(&self, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push('f'); + for segment in &self.keypath { + string.push_str(&segment); + }; + string.push('#'); + string.push_str(&seq.to_string()); + string + } + + // Build the index key that corresponds to a number primitive + pub fn number_key(&self, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push_str(&self.number_key_without_arraypath(seq)); + KeyBuilder::add_arraypath(&mut string, &self.arraypath); + string + } + + /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath /// built up internally. pub fn stemmed_word_key(&self, word: &str, seq: u64) -> String { @@ -114,6 +135,10 @@ impl KeyBuilder { KeyBuilder::add_arraypath(keypathword, &dr.arraypath); } + // NOTE vmx 2017-04-13: I find `keypathword` not really descriptive. I would call the + // path without the Internal Id simply "keypath" and the one with and Internal Id + // "keypath_iid". 
+ /// Truncates key to keypath only pub fn truncate_to_keypathword(stemmed_word_key: &mut String) { let n = stemmed_word_key.rfind("#").unwrap(); stemmed_word_key.truncate(n + 1); diff --git a/src/parser.rs b/src/parser.rs index d8ae6b7..4ca5b4d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -15,7 +15,7 @@ use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, ReturnPath}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter, - NotFilter}; + NotFilter, RangeFilter}; use snapshot::Snapshot; @@ -635,18 +635,29 @@ impl<'a, 'c> Parser<'a, 'c> { return Ok(Box::new(NotFilter::new(try!(self.compare()), self.kb.arraypath_len()))); } if self.consume("==") { - let literal = try!(self.must_consume_string_literal()); + let json = try!(self.must_consume_json_primitive()); let boost = try!(self.consume_boost()); - let mut filters: Vec = Vec::new(); - { - let stems = Stems::new(&literal); - for stem in stems { - filters.push(StemmedWordPosFilter::new(&self.snapshot, - &stem.stemmed, &self.kb, boost)); - } - } - let filter = StemmedPhraseFilter::new(filters); - Ok(Box::new(ExactMatchFilter::new(&self.snapshot, filter, self.kb.clone(), literal, true))) + let filter: Box = match json { + JsonValue::String(literal) => { + let mut filters: Vec = Vec::new(); + { + let stems = Stems::new(&literal); + for stem in stems { + filters.push(StemmedWordPosFilter::new(&self.snapshot, + &stem.stemmed, + &self.kb, + boost)); + } + } + let filter = StemmedPhraseFilter::new(filters); + Box::new(ExactMatchFilter::new(&self.snapshot, filter, self.kb.clone(), literal, true)) + }, + JsonValue::Number(num) => { + Box::new(RangeFilter::new(&self.snapshot, self.kb.clone(), Some(num), Some(num))) + }, + _ => panic!("Comparison on other JSON types is not yet implemented!"), + }; + Ok(filter) } else if self.consume("~=") { // regular search let literal = try!(self.must_consume_string_literal()); @@ -1005,6 +1016,14 @@ impl<'a, 'c> Parser<'a, 'c> { } } + fn must_consume_json_primitive(&mut self) -> Result { + if let Some(json) = try!(self.json_primitive()) { + Ok(json) + } else { + Err(Error::Parse("Expected JSON primitive.".to_string())) + } + } + /// JSON primites are strings, numbers, booleans and null fn json_primitive(&mut self) -> Result, Error> { if let Some(string) = try!(self.consume_string_literal()) { @@ -1143,4 +1162,4 @@ mod tests { let mut parser = Parser::new(&query, snapshot); assert!(parser.find().is_err()); } -} \ No newline at end of file +} From 594b011ff3ec31a5a05aea36dc66184ab3ee40c1 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 18 Apr 2017 15:24:58 +0200 Subject: [PATCH 093/122] Add less than and greater than (equal) filter for numbers It's now possible to query on open ranges, e.g.: find {A: >=12}; --- repl-tests/ranges.noise | 73 +++++++++++++++++++++++++++++++++++++++++ src/filters.rs | 42 ++++++++++++++++-------- src/parser.rs | 32 ++++++++++++++++-- 3 files changed, 130 insertions(+), 17 deletions(-) diff --git a/repl-tests/ranges.noise b/repl-tests/ranges.noise index 212429f..54ed270 100644 --- a/repl-tests/ranges.noise +++ b/repl-tests/ranges.noise @@ -10,6 +10,10 @@ add {"_id":"two", "A":12}; "two" add {"_id":"three", "numberarray": [30, 60, 90]}; "three" +add {"_id":"four", "A":-3}; +"four" +add {"_id":"five", "A":35}; +"five" # Exact match number @@ -31,3 +35,72 @@ find {A: ==12}; [ "two" ] + + +# Greater than (equal) on number + +find {A: 
>20}; +[ +"five" +] + +find {A: >-5}; +[ +"two", +"four", +"five" +] + +find {numberarray: [>40]}; +[ +"three" +] + +find {A: >35}; +[] + +find {A: >=35}; +[ +"five" +] + + +# Less than (equal) on number + +find {A: <-1}; +[ +"four" +] + +find {A: <20}; +[ +"two", +"four" +] + +find {numberarray: [<70]}; +[ +"three" +] + +find {A: <-3}; +[] + +find {A: <=-3}; +[ +"four" +] + + +# Range on number + +find {A: >10, A: <20}; +[ +"two" +] + +find {A: >-10, A: <20}; +[ +"two", +"four" +] diff --git a/src/filters.rs b/src/filters.rs index b38aa1b..cfdf27a 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -23,6 +23,11 @@ pub trait QueryRuntimeFilter { } +pub enum RangeOperator { + Inclusive(f64), + Exclusive(f64), +} + pub struct StemmedWordFilter { iter: DocResultIterator, scorer: Scorer, @@ -301,13 +306,16 @@ impl QueryRuntimeFilter for ExactMatchFilter { pub struct RangeFilter { iter: DBIterator, kb: KeyBuilder, - min: Option, - max: Option, + min: Option, + max: Option, keypath: String, } impl RangeFilter { - pub fn new(snapshot: &Snapshot, kb: KeyBuilder, min: Option, max: Option) -> RangeFilter { + pub fn new(snapshot: &Snapshot, + kb: KeyBuilder, + min: Option, + max: Option) -> RangeFilter { RangeFilter { iter: snapshot.new_iterator(), kb: kb, @@ -345,18 +353,24 @@ impl QueryRuntimeFilter for RangeFilter { mem::transmute::<[u8; 8], f64>(array) }; - match (self.min, self.max) { - (Some(min), Some(max)) => { - if number >= min && number <= max { - let dr = KeyBuilder::parse_doc_result_from_key(&key_str); - return Some(dr); - } - // Keep looping and move on to the next key - }, - _ => { - panic!("Only closed ranges are supported atm"); - } + let min_condition = match self.min { + Some(RangeOperator::Inclusive(min)) => number >= min, + Some(RangeOperator::Exclusive(min)) => number > min, + // No condition was given => it always matches + None => true, + }; + let max_condition = match self.max { + Some(RangeOperator::Inclusive(max)) => number <= max, + Some(RangeOperator::Exclusive(max)) => number < max, + // No condition was given => it always matches + None => true, + }; + + if min_condition && max_condition { + let dr = KeyBuilder::parse_doc_result_from_key(&key_str); + return Some(dr); } + // Else: No match => KKeep looping and move on to the next key } None } diff --git a/src/parser.rs b/src/parser.rs index 4ca5b4d..59f5a76 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -15,7 +15,7 @@ use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, ReturnPath}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter, - NotFilter, RangeFilter}; + NotFilter, RangeFilter, RangeOperator}; use snapshot::Snapshot; @@ -514,6 +514,21 @@ impl<'a, 'c> Parser<'a, 'c> { Ok(Some(lit)) } + fn consume_range_operator(&mut self) -> Result { + let inclusive = self.consume("="); + let json = try!(self.must_consume_json_primitive()); + match json { + JsonValue::Number(num) => { + if inclusive { + Ok(RangeOperator::Inclusive(num)) + } else { + Ok(RangeOperator::Exclusive(num)) + } + }, + _ => panic!("Range operator on other JSON types is not yet implemented!"), + } + } + fn find<'b>(&'b mut self) -> Result, Error> { if !self.consume("find") { return Err(Error::Parse("Missing 'find' keyword".to_string())); @@ -653,9 +668,12 @@ impl<'a, 'c> Parser<'a, 'c> { Box::new(ExactMatchFilter::new(&self.snapshot, filter, self.kb.clone(), literal, true)) }, JsonValue::Number(num) => { 
- Box::new(RangeFilter::new(&self.snapshot, self.kb.clone(), Some(num), Some(num))) + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::Inclusive(num)), + Some(RangeOperator::Inclusive(num)))) }, - _ => panic!("Comparison on other JSON types is not yet implemented!"), + _ => panic!("Exact match on other JSON types is not yet implemented!"), }; Ok(filter) } else if self.consume("~=") { // regular search let literal = try!(self.must_consume_string_literal()); @@ -706,6 +724,14 @@ impl<'a, 'c> Parser<'a, 'c> { 0 => panic!("Cannot create a DistanceFilter"), _ => Ok(Box::new(DistanceFilter::new(filters, word_distance as u32))), } + } else if self.consume(">") { + let min = try!(self.consume_range_operator()); + let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), Some(min), None); + Ok(Box::new(filter)) + } else if self.consume("<") { + let max = try!(self.consume_range_operator()); + let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), None, Some(max)); + Ok(Box::new(filter)) } else { Err(Error::Parse("Expected comparison operator".to_string())) } From 1e364214a8f27f4dd94a2253e281be5dd81db5cc Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Wed, 19 Apr 2017 13:08:40 +0200 Subject: [PATCH 094/122] BREAKING CHANGE: Rename count_key prefix from 'F' to 'C' It's renamed so that 'F' can be used for "false" values, so that it is consistent with the prefixes used in other places. This is a breaking change, you need to recreate your index. --- src/index.rs | 4 ++-- src/key_builder.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/index.rs b/src/index.rs index 9c8d847..70da6aa 100644 --- a/src/index.rs +++ b/src/index.rs @@ -276,7 +276,7 @@ impl Index { } fn compaction_filter(_level: u32, key: &[u8], value: &[u8]) -> CompactionDecision { - if !(key[0] as char == 'F' || key[0] as char == 'K') { + if !(key[0] as char == 'C' || key[0] as char == 'K') { return CompactionDecision::Keep; } if 0 == Index::convert_bytes_to_i32(&value) { @@ -300,7 +300,7 @@ impl Index { existing_val: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - if !(new_key[0] as char == 'F' || new_key[0] as char == 'K') { + if !(new_key[0] as char == 'C' || new_key[0] as char == 'K') { panic!("unknown key type to merge!"); } diff --git a/src/key_builder.rs b/src/key_builder.rs index 1389fa0..96181d4 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -43,7 +43,7 @@ impl KeyBuilder { pub fn keypathword_count_key(&self, word: &str) -> String { let mut string = String::with_capacity(100); - string.push('F'); + string.push('C'); for segment in &self.keypath { string.push_str(&segment); } From 12abb2ec299f46f318f54971f7a24ec49dfb0d10 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Tue, 18 Apr 2017 17:01:31 +0200 Subject: [PATCH 095/122] Don't repeat yourself refactoring This change doesn't change any functionality; it's a pure refactoring.
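Returning to the range operators wired up above: `consume_range_operator` turns the trailing `=` of `>=`/`<=` into an inclusive bound, and `RangeFilter` carries an optional bound per side. A reduced sketch of the resulting check, with illustrative names rather than the crate's types:

    enum Bound {
        Inclusive(f64),
        Exclusive(f64),
    }

    // An absent bound always matches, so `find {A: >10}` becomes
    // (min: Exclusive(10.0), max: None).
    fn in_range(number: f64, min: &Option<Bound>, max: &Option<Bound>) -> bool {
        let min_ok = match *min {
            Some(Bound::Inclusive(m)) => number >= m,
            Some(Bound::Exclusive(m)) => number > m,
            None => true,
        };
        let max_ok = match *max {
            Some(Bound::Inclusive(m)) => number <= m,
            Some(Bound::Exclusive(m)) => number < m,
            None => true,
        };
        min_ok && max_ok
    }

    fn main() {
        // find {A: >10, A: <20} with A = 12
        assert!(in_range(12.0, &Some(Bound::Exclusive(10.0)), &Some(Bound::Exclusive(20.0))));
        // find {A: >=35} with A = 35
        assert!(in_range(35.0, &Some(Bound::Inclusive(35.0)), &None));
    }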
--- src/json_shred.rs | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/src/json_shred.rs b/src/json_shred.rs index 3061aa6..67b1424 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -61,14 +61,6 @@ impl Shredder { try!(batch.put(&number_key.as_bytes(), &number[1..])); } - // Add/elete the key-value pair of the shredded original JSON - let value_key = kb.value_key(docseq); - if delete { - try!(batch.delete(&value_key.as_bytes())); - } else { - try!(batch.put(&value_key.into_bytes(), &number.as_ref())); - } - Ok(()) } @@ -119,17 +111,6 @@ impl Shredder { try!(batch.merge(&key.into_bytes(), one_enc_bytes.get_ref())); } - let key = kb.value_key(docseq); - if delete { - try!(batch.delete(&key.into_bytes())); - } else { - let mut buffer = String::with_capacity(text.len() + 1); - buffer.push('s'); - buffer.push_str(&text); - - try!(batch.put(&key.into_bytes(), &buffer.as_bytes())); - } - Ok(()) } @@ -217,10 +198,9 @@ impl Shredder { 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); }, - _ => { - try!(batch.delete(key.as_bytes())); - }, + _ => {}, } + try!(batch.delete(&key.as_bytes())); } self.existing_key_value_to_delete = BTreeMap::new(); @@ -235,11 +215,10 @@ impl Shredder { 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, false)); }, - _ => { - let key = self.kb.value_key(seq); - try!(batch.put(&key.as_bytes(), &value.as_ref())); - }, + _ => {}, } + let key = self.kb.value_key(seq); + try!(batch.put(&key.as_bytes(), &value.as_ref())); } self.shredded_key_values = BTreeMap::new(); @@ -266,10 +245,9 @@ impl Shredder { 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); }, - _ => { - try!(batch.delete(&key.as_bytes())); - }, + _ => {}, } + try!(batch.delete(&key.as_bytes())); } let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); try!(batch.delete(&key.into_bytes())); From e8c8982e58226eca1a0e8cdfe2bf328577ea8c16 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Thu, 20 Apr 2017 00:01:19 +0200 Subject: [PATCH 096/122] Extend comparison function to other types The comparator was working on strings only, it now works with other types like numbers, booleans and null as well. --- src/filters.rs | 2 +- src/index.rs | 3 ++- src/key_builder.rs | 24 +++++++++++++----------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/filters.rs b/src/filters.rs index cfdf27a..104649a 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -329,7 +329,7 @@ impl RangeFilter { impl QueryRuntimeFilter for RangeFilter { fn first_result(&mut self, start: &DocResult) -> Option { - let mut value_key = self.kb.number_key_without_arraypath(start.seq); + let mut value_key = self.kb.number_key(start.seq); // NOTE vmx 2017-04-13: Iterating over keys is really similar to the // `DocResultIterator` in `snapshot.rs`. It should probablly be unified. 
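The comparator change below in src/index.rs and src/key_builder.rs is the core of this patch: word ('W'), number ('f'), boolean ('T'/'F') and null ('N') keys all need to sort by key path first and then by sequence number as a number, since plain byte order would put seq "10" before seq "9". A simplified sketch of that rule, assuming keys shaped like <prefix><keypath>#<seq>,<arraypath> (the real comparator handles more than this):

    use std::cmp::Ordering;

    /// Simplified sketch: order keys by path bytes, then by seq as a number,
    /// then by arraypath.
    fn compare_keys(a: &str, b: &str) -> Ordering {
        fn split(key: &str) -> (&str, u64, &str) {
            let hash = key.rfind('#').expect("key contains '#'");
            let rest = &key[hash + 1..];
            let comma = rest.find(',').unwrap_or(rest.len());
            let seq = rest[..comma].parse::<u64>().expect("numeric seq");
            (&key[..hash], seq, &rest[comma..])
        }
        let (apath, aseq, aarr) = split(a);
        let (bpath, bseq, barr) = split(b);
        apath.cmp(bpath).then(aseq.cmp(&bseq)).then(aarr.cmp(barr))
    }

    fn main() {
        // Byte order would sort seq 10 before seq 9; the comparator must not.
        assert_eq!(compare_keys("f.A#9", "f.A#10"), Ordering::Less);
        assert_eq!(compare_keys("T.flag#3,0", "T.flag#3,1"), Ordering::Less);
    }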
diff --git a/src/index.rs b/src/index.rs index 70da6aa..9437369 100644 --- a/src/index.rs +++ b/src/index.rs @@ -287,7 +287,8 @@ impl Index { } fn compare_keys(a: &[u8], b: &[u8]) -> Ordering { - if a[0] == 'W' as u8 && b[0] == 'W' as u8 { + let value_prefixes = ['W', 'f', 'T', 'F', 'N']; + if value_prefixes.contains(&(a[0] as char)) && value_prefixes.contains(&(b[0] as char)) { let astr = unsafe {str::from_utf8_unchecked(&a)}; let bstr = unsafe {str::from_utf8_unchecked(&b)}; KeyBuilder::compare_keys(astr, bstr) diff --git a/src/key_builder.rs b/src/key_builder.rs index 96181d4..d030ce4 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -68,8 +68,8 @@ impl KeyBuilder { str } - // Build the keypath for a number primitive without the arraypath - pub fn number_key_without_arraypath(&self, seq: u64) -> String { + /// Build the index key that corresponds to a number primitive + pub fn number_key(&self, seq: u64) -> String { let mut string = String::with_capacity(100); string.push('f'); for segment in &self.keypath { @@ -77,13 +77,7 @@ impl KeyBuilder { }; string.push('#'); string.push_str(&seq.to_string()); - string - } - // Build the index key that corresponds to a number primitive - pub fn number_key(&self, seq: u64) -> String { - let mut string = String::with_capacity(100); - string.push_str(&self.number_key_without_arraypath(seq)); KeyBuilder::add_arraypath(&mut string, &self.arraypath); string } @@ -357,14 +351,22 @@ impl KeyBuilder { } pub fn compare_keys(akey: &str, bkey: &str) -> Ordering { - debug_assert!(akey.starts_with('W')); - debug_assert!(bkey.starts_with('W')); + debug_assert!(akey.starts_with('W') || + akey.starts_with('f') || + akey.starts_with('T') || + akey.starts_with('F') || + akey.starts_with('N')); + debug_assert!(bkey.starts_with('W') || + bkey.starts_with('f') || + bkey.starts_with('T') || + bkey.starts_with('F') || + bkey.starts_with('N')); let (apath_str, aseq_str, aarraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); let (bpath_str, bseq_str, barraypath_str) = KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); - match apath_str[1..].cmp(&bpath_str[1..]) { + match apath_str[0..].cmp(&bpath_str[0..]) { Ordering::Less => Ordering::Less, Ordering::Greater => Ordering::Greater, Ordering::Equal => { From c9d5efb933e58a39bb9124030e05befbbc8b9cf0 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Thu, 20 Apr 2017 00:26:51 +0200 Subject: [PATCH 097/122] Add exact match filter for booleans and null It's now possible to do exact match queries on booleans and null. 
Examples: find {A: ==true}; find {A: ==false}; find {A: ==null}; find {boolarray: [==true]}; --- repl-tests/ranges.noise | 34 ++++++++++++++++++++++++++++++++++ src/filters.rs | 38 ++++++++++++++++++++++++++++++++++++-- src/json_shred.rs | 34 ++++++++++++++++++++++++++++++++++ src/key_builder.rs | 14 ++++++++++++++ src/parser.rs | 18 ++++++++++++++++++ 5 files changed, 136 insertions(+), 2 deletions(-) diff --git a/repl-tests/ranges.noise b/repl-tests/ranges.noise index 54ed270..e1bb0c0 100644 --- a/repl-tests/ranges.noise +++ b/repl-tests/ranges.noise @@ -14,6 +14,16 @@ add {"_id":"four", "A":-3}; "four" add {"_id":"five", "A":35}; "five" +add {"_id":"six", "A":true}; +"six" +add {"_id":"seven", "A":false}; +"seven" +add {"_id":"eight", "A":null}; +"eight" +add {"_id":"nine", "boolarray":[true, true]}; +"nine" +add {"_id":"ten", "boolarray":[false, true]}; +"ten" # Exact match number @@ -104,3 +114,27 @@ find {A: >-10, A: <20}; "two", "four" ] + + +# Exact match boolean + +find {A: ==true}; +[ +"six" +] + +find {A: ==false}; +[ +"seven" +] + +find {A: ==null}; +[ +"eight" +] + +find {boolarray: [==true]}; +[ +"nine", +"ten" +] diff --git a/src/filters.rs b/src/filters.rs index 104649a..a6a916b 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -23,9 +23,15 @@ pub trait QueryRuntimeFilter { } +#[derive(PartialEq)] pub enum RangeOperator { Inclusive(f64), Exclusive(f64), + // For booleans and null only exact match makes sense, hence no inclusive/exclusive + // boundaries are needed + True, + False, + Null, } pub struct StemmedWordFilter { @@ -329,8 +335,24 @@ impl RangeFilter { impl QueryRuntimeFilter for RangeFilter { fn first_result(&mut self, start: &DocResult) -> Option { - let mut value_key = self.kb.number_key(start.seq); - + let mut value_key = { + // `min` and `max` have the save type, so picking one is OK + let range_operator = self.min.as_ref().or(self.max.as_ref()).unwrap(); + match range_operator { + &RangeOperator::Inclusive(_) | &RangeOperator::Exclusive(_) => { + self.kb.number_key(start.seq) + }, + &RangeOperator::True => { + self.kb.bool_null_key('T', start.seq) + }, + &RangeOperator::False => { + self.kb.bool_null_key('F', start.seq) + }, + &RangeOperator::Null => { + self.kb.bool_null_key('N', start.seq) + } + } + }; // NOTE vmx 2017-04-13: Iterating over keys is really similar to the // `DocResultIterator` in `snapshot.rs`. It should probablly be unified. self.iter.set_mode(IteratorMode::From(value_key.as_bytes(), @@ -348,6 +370,16 @@ impl QueryRuntimeFilter for RangeFilter { } let key_str = unsafe{ str::from_utf8_unchecked(&key) }; + + // The key already matched, hence it's a valid doc result. Return it. 
+ if self.min == Some(RangeOperator::True) + || self.min == Some(RangeOperator::False) + || self.min == Some(RangeOperator::Null) { + let dr = KeyBuilder::parse_doc_result_from_key(&key_str); + return Some(dr); + } + // Else it's a range query on numbers + let number = unsafe{ let array = *(value[..].as_ptr() as *const [_; 8]); mem::transmute::<[u8; 8], f64>(array) @@ -358,12 +390,14 @@ impl QueryRuntimeFilter for RangeFilter { Some(RangeOperator::Exclusive(min)) => number > min, // No condition was given => it always matches None => true, + _ => panic!("Can't happen, it returns early on the other types"), }; let max_condition = match self.max { Some(RangeOperator::Inclusive(max)) => number <= max, Some(RangeOperator::Exclusive(max)) => number < max, // No condition was given => it always matches None => true, + _ => panic!("Can't happen, it returns early on the other types"), }; if min_condition && max_condition { diff --git a/src/json_shred.rs b/src/json_shred.rs index 67b1424..28d1ccd 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -64,6 +64,19 @@ impl Shredder { Ok(()) } + fn add_bool_null_entries(kb: &mut KeyBuilder, prefix: char, docseq: u64, + batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { + let key = kb.bool_null_key(prefix, docseq); + if delete { + try!(batch.delete(&key.as_bytes())); + } else { + // No need to store any value as the key already contains it + try!(batch.put(&key.as_bytes(), &[])); + } + + Ok(()) + } + fn add_stemmed_entries(kb: &mut KeyBuilder, text: &str, docseq: u64, batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { let stems = Stems::new(text); @@ -198,6 +211,13 @@ impl Shredder { 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); }, + 'T' | 'F' | 'N' => { + try!(Shredder::add_bool_null_entries(&mut self.kb, + value[0] as char, + seq, + batch, + true)); + }, _ => {}, } try!(batch.delete(&key.as_bytes())); @@ -215,6 +235,13 @@ impl Shredder { 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, false)); }, + 'T' | 'F' | 'N' => { + try!(Shredder::add_bool_null_entries(&mut self.kb, + value[0] as char, + seq, + batch, + false)); + }, _ => {}, } let key = self.kb.value_key(seq); @@ -245,6 +272,13 @@ impl Shredder { 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); }, + 'T' | 'F' | 'N' => { + try!(Shredder::add_bool_null_entries(&mut self.kb, + value[0] as char, + seq, + batch, + true)); + }, _ => {}, } try!(batch.delete(&key.as_bytes())); diff --git a/src/key_builder.rs b/src/key_builder.rs index d030ce4..749ec4c 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -82,6 +82,20 @@ impl KeyBuilder { string } + /// Build the index key that corresponds to a true, false or nulla primitive + pub fn bool_null_key(&self, prefix: char, seq: u64) -> String { + let mut string = String::with_capacity(100); + string.push(prefix); + for segment in &self.keypath { + string.push_str(&segment); + }; + string.push('#'); + string.push_str(&seq.to_string()); + + KeyBuilder::add_arraypath(&mut string, &self.arraypath); + string + } + /// Builds a stemmed word key for the input word and seq, using the key_path and arraypath /// built up internally. 
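Before the parser wiring that follows, note the shape of these keys: the first byte already states the value, so matching `==true` is a pure prefix seek and nothing needs to be stored in the RocksDB value. A sketch with illustrative path names:

    /// Sketch of a boolean/null index key: the prefix byte 'T', 'F' or 'N'
    /// encodes the value, followed by keypath, '#', seq and the arraypath.
    fn bool_null_key(prefix: char, keypath: &str, seq: u64, arraypath: &[u64]) -> String {
        let mut key = String::with_capacity(100);
        key.push(prefix);
        key.push_str(keypath);
        key.push('#');
        key.push_str(&seq.to_string());
        for idx in arraypath {
            key.push(',');
            key.push_str(&idx.to_string());
        }
        key
    }

    fn main() {
        // `find {A: ==true}` for doc seq 6 seeks keys starting with "T.A#"
        assert_eq!(bool_null_key('T', ".A", 6, &[]), "T.A#6");
        // `find {boolarray: [==false]}` hits keys like "F.boolarray$#10,0"
        assert_eq!(bool_null_key('F', ".boolarray$", 10, &[0]), "F.boolarray$#10,0");
    }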
diff --git a/src/parser.rs index 59f5a76..9295a0d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -673,6 +673,24 @@ impl<'a, 'c> Parser<'a, 'c> { Some(RangeOperator::Inclusive(num)), Some(RangeOperator::Inclusive(num)))) }, + JsonValue::True => { + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::True), + Some(RangeOperator::True))) + }, + JsonValue::False => { + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::False), + Some(RangeOperator::False))) + }, + JsonValue::Null => { + Box::new(RangeFilter::new(&self.snapshot, + self.kb.clone(), + Some(RangeOperator::Null), + Some(RangeOperator::Null))) + }, _ => panic!("Exact match on other JSON types is not yet implemented!"), }; Ok(filter) From 955a5ff5393eb5be67300b5984a90e5ce8de24d8 Mon Sep 17 00:00:00 2001 From: Volker Mische Date: Thu, 20 Apr 2017 00:39:58 +0200 Subject: [PATCH 098/122] Run repl tests ordered by last modified date The order of running the repl tests shouldn't matter, hence it's a good idea not to always run them in the same order. To make things easier to debug, sort the tests by last modified date. If you edit a test (which is likely if you debug it), it will run last. So it's easy to see its output if there is any. --- tests/repl_tests.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/repl_tests.rs b/tests/repl_tests.rs index 8e4671b..20eefb3 100644 --- a/tests/repl_tests.rs +++ b/tests/repl_tests.rs @@ -20,8 +20,11 @@ fn test_repl() { test_dir.push("repl-tests"); let mut failures = 0; let mut total = 0; - for entry in fs::read_dir(test_dir).unwrap() { - let mut path = entry.unwrap().path(); + // Sort files by last modified date to make debugging easier + let mut entries: Vec<_> = fs::read_dir(test_dir).unwrap().map(|r| r.unwrap()).collect(); + entries.sort_by_key(|entry| entry.metadata().unwrap().modified().unwrap()); + for entry in entries { + let mut path = entry.path(); if path.extension().unwrap().to_str().unwrap() != "noise" { continue; } @@ -57,4 +60,4 @@ fn test_repl() { if failures > 0 { panic!("Failed {} tests in repl-test out of {}", failures, total); } -} \ No newline at end of file +} From e905abff5681d1ab24e2d64ca8322ebc6bcab6c8 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Mon, 24 Apr 2017 16:15:16 -0700 Subject: [PATCH 099/122] rustfmt run on source files rustfmt is a tool that will modify Rust source code to make it adhere to a code formatting standard. It also has plugins for various popular code editors to automatically format to those standards as you write code. Using it ensures more uniform and readable source code. It has been run here, and where it could not automatically fix incorrectly formatted code, I corrected those places manually.
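One structure in the reformatted diff below is worth a gloss: each aggregate in src/aggregates.rs is a table of plain function pointers, an optional `init`, an `action` folded over every input, and an optional `extract` for the final value. A reduced sketch of how such a table can be driven, using `f64` in place of `JsonValue`:

    struct AggImpl {
        init: Option<fn() -> f64>,
        action: fn(&mut f64, f64),
        extract: Option<fn(f64) -> f64>,
    }

    // Fold `action` over every input, bracketed by the optional init/extract.
    fn run(agg: &AggImpl, inputs: &[f64]) -> f64 {
        let mut acc = agg.init.map_or(0.0, |f| f());
        for &x in inputs {
            (agg.action)(&mut acc, x);
        }
        agg.extract.map_or(acc, |f| f(acc))
    }

    fn main() {
        let sum = AggImpl {
            init: Some(|| 0.0),
            action: |acc: &mut f64, x: f64| *acc += x,
            extract: None,
        };
        assert_eq!(run(&sum, &[1.0, 2.0, 3.0]), 6.0);
    }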
--- src/aggregates.rs | 133 ++++++++++++--------- src/filters.rs | 248 ++++++++++++++++++++++----------------- src/index.rs | 144 ++++++++++++----------- src/json_shred.rs | 196 +++++++++++++++++-------------- src/json_value.rs | 42 ++++--- src/key_builder.rs | 73 ++++++------ src/main.rs | 10 +- src/parser.rs | 279 ++++++++++++++++++++++++++------------------ src/query.rs | 186 ++++++++++++++++------------- src/repl.rs | 12 +- src/returnable.rs | 93 +++++++++------ src/snapshot.rs | 126 ++++++++++---------- src/stems.rs | 122 ++++++++++++------- tests/repl_tests.rs | 33 ++++-- tests/rocksdb.rs | 2 +- 15 files changed, 977 insertions(+), 722 deletions(-) diff --git a/src/aggregates.rs b/src/aggregates.rs index 8bd054f..8a7e962 100644 --- a/src/aggregates.rs +++ b/src/aggregates.rs @@ -21,13 +21,13 @@ pub enum AggregateFun { pub struct AggregateFunImpls { // Initalizes for a computing the aggregate action (optional) - pub init: Option JsonValue>, + pub init: Option JsonValue>, // The actual aggregate action function - pub action: fn (&mut JsonValue, JsonValue, &JsonValue), + pub action: fn(&mut JsonValue, JsonValue, &JsonValue), // extracts the final aggregate value (optional) - pub extract: Option, + pub extract: Option, } impl AggregateFun { @@ -35,56 +35,76 @@ impl AggregateFun { match self { &AggregateFun::GroupAsc => panic!("cannot get aggregate fun for grouping!"), &AggregateFun::GroupDesc => panic!("cannot get aggregate fun for grouping!"), - &AggregateFun::Sum => AggregateFunImpls{ - init: Some(AggregateFun::sum_init), - action: AggregateFun::sum, - extract: None, - }, - &AggregateFun::Max => AggregateFunImpls{ - init: None, - action: AggregateFun::max, - extract: None, - }, - &AggregateFun::Min => AggregateFunImpls{ - init: None, - action: AggregateFun::min, - extract: None, - }, - &AggregateFun::MaxArray => AggregateFunImpls{ - init: Some(AggregateFun::max_array_init), - action: AggregateFun::max_array, - extract: None, - }, - &AggregateFun::MinArray => AggregateFunImpls{ - init: Some(AggregateFun::min_array_init), - action: AggregateFun::min_array, - extract: None, - }, - &AggregateFun::Array => AggregateFunImpls{ - init: Some(AggregateFun::array_init), - action: AggregateFun::array, - extract: None, - }, - &AggregateFun::ArrayFlat => AggregateFunImpls{ - init: Some(AggregateFun::array_flat_init), - action: AggregateFun::array_flat, - extract: None, - }, - &AggregateFun::Concat => AggregateFunImpls{ - init: Some(AggregateFun::concat_init), - action: AggregateFun::concat, - extract: None, - }, - &AggregateFun::Avg => AggregateFunImpls{ - init: Some(AggregateFun::avg_init), - action: AggregateFun::avg, - extract: Some(AggregateFun::avg_final), - }, - &AggregateFun::Count => AggregateFunImpls{ - init: Some(AggregateFun::count_init), - action: AggregateFun::count, - extract: None, - }, + &AggregateFun::Sum => { + AggregateFunImpls { + init: Some(AggregateFun::sum_init), + action: AggregateFun::sum, + extract: None, + } + } + &AggregateFun::Max => { + AggregateFunImpls { + init: None, + action: AggregateFun::max, + extract: None, + } + } + &AggregateFun::Min => { + AggregateFunImpls { + init: None, + action: AggregateFun::min, + extract: None, + } + } + &AggregateFun::MaxArray => { + AggregateFunImpls { + init: Some(AggregateFun::max_array_init), + action: AggregateFun::max_array, + extract: None, + } + } + &AggregateFun::MinArray => { + AggregateFunImpls { + init: Some(AggregateFun::min_array_init), + action: AggregateFun::min_array, + extract: None, + } + } + &AggregateFun::Array => 
{ + AggregateFunImpls { + init: Some(AggregateFun::array_init), + action: AggregateFun::array, + extract: None, + } + } + &AggregateFun::ArrayFlat => { + AggregateFunImpls { + init: Some(AggregateFun::array_flat_init), + action: AggregateFun::array_flat, + extract: None, + } + } + &AggregateFun::Concat => { + AggregateFunImpls { + init: Some(AggregateFun::concat_init), + action: AggregateFun::concat, + extract: None, + } + } + &AggregateFun::Avg => { + AggregateFunImpls { + init: Some(AggregateFun::avg_init), + action: AggregateFun::avg, + extract: Some(AggregateFun::avg_final), + } + } + &AggregateFun::Count => { + AggregateFunImpls { + init: Some(AggregateFun::count_init), + action: AggregateFun::count, + extract: None, + } + } } } @@ -100,12 +120,12 @@ impl AggregateFun { if let &mut JsonValue::Number(ref mut existing) = existing { *existing += new; } - }, + } JsonValue::Array(vec) => { for v in vec { AggregateFun::sum(existing, v, _user_arg); } - }, + } _ => (), } } @@ -243,7 +263,7 @@ impl AggregateFun { // can't happen but compiler need this here 1.0 }; - + avg = (avg * count + new) / (count + 1.0); count += 1.0; array[0] = JsonValue::Number(avg); @@ -290,4 +310,3 @@ impl AggregateFun { } } } - diff --git a/src/filters.rs b/src/filters.rs index a6a916b..ab7a4bd 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -14,10 +14,10 @@ pub trait QueryRuntimeFilter { fn first_result(&mut self, start: &DocResult) -> Option; fn next_result(&mut self) -> Option; fn prepare_relevancy_scoring(&mut self, qsi: &mut QueryScoringInfo); - + /// returns error is a double negation is detected fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error>; - + /// return true if filter or all subfilters are NotFilters fn is_all_not(&self) -> bool; } @@ -40,8 +40,11 @@ pub struct StemmedWordFilter { } impl StemmedWordFilter { - pub fn new(snapshot: &Snapshot, stemmed_word: &str, - kb: &KeyBuilder, boost: f32) -> StemmedWordFilter { + pub fn new(snapshot: &Snapshot, + stemmed_word: &str, + kb: &KeyBuilder, + boost: f32) + -> StemmedWordFilter { StemmedWordFilter { iter: snapshot.new_term_doc_result_iterator(stemmed_word, kb), scorer: snapshot.new_scorer(stemmed_word, kb, boost), @@ -74,7 +77,7 @@ impl QueryRuntimeFilter for StemmedWordFilter { fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { Ok(()) } - + fn is_all_not(&self) -> bool { false } @@ -88,9 +91,12 @@ pub struct StemmedWordPosFilter { } impl StemmedWordPosFilter { - pub fn new(snapshot: &Snapshot, stemmed_word: &str, - kb: &KeyBuilder, boost: f32) -> StemmedWordPosFilter { - StemmedWordPosFilter{ + pub fn new(snapshot: &Snapshot, + stemmed_word: &str, + kb: &KeyBuilder, + boost: f32) + -> StemmedWordPosFilter { + StemmedWordPosFilter { iter: snapshot.new_term_doc_result_iterator(stemmed_word, kb), scorer: snapshot.new_scorer(&stemmed_word, &kb, boost), } @@ -126,16 +132,16 @@ pub struct StemmedPhraseFilter { impl StemmedPhraseFilter { pub fn new(filters: Vec) -> StemmedPhraseFilter { assert!(filters.len() > 0); - StemmedPhraseFilter { - filters: filters, - } + StemmedPhraseFilter { filters: filters } } fn result(&mut self, base: Option<(DocResult, Vec)>) -> Option { - // this is the number of matches left before all terms match and we can return a result + // this is the number of matches left before all terms match and we can return a result let mut matches_left = self.filters.len() - 1; - if base.is_none() { return None; } + if base.is_none() { + return None; + } let (mut base_result, mut base_positions) = 
base.unwrap(); if matches_left == 0 { @@ -150,8 +156,10 @@ impl StemmedPhraseFilter { } let next = self.filters[current_filter].first_result(&base_result); - - if next.is_none() { return None; } + + if next.is_none() { + return None; + } let (next_result, next_positions) = next.unwrap(); if base_result == next_result { @@ -173,7 +181,9 @@ impl StemmedPhraseFilter { // we didn't match on phrase, so get next_result from first filter current_filter = 0; let next = self.filters[current_filter].next_result(); - if next.is_none() { return None; } + if next.is_none() { + return None; + } let (next_result, next_positions) = next.unwrap(); base_result = next_result; base_positions = next_positions; @@ -185,7 +195,9 @@ impl StemmedPhraseFilter { // 1st filter. current_filter = 0; let next = self.filters[current_filter].first_result(&next_result); - if next.is_none() { return None; } + if next.is_none() { + return None; + } let (next_result, next_positions) = next.unwrap(); base_result = next_result; base_positions = next_positions; @@ -217,7 +229,7 @@ impl QueryRuntimeFilter for StemmedPhraseFilter { fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { Ok(()) } - + fn is_all_not(&self) -> bool { false } @@ -233,13 +245,21 @@ pub struct ExactMatchFilter { } impl ExactMatchFilter { - pub fn new(snapshot: &Snapshot, filter: StemmedPhraseFilter, - kb: KeyBuilder, phrase: String, case_sensitive: bool) -> ExactMatchFilter { + pub fn new(snapshot: &Snapshot, + filter: StemmedPhraseFilter, + kb: KeyBuilder, + phrase: String, + case_sensitive: bool) + -> ExactMatchFilter { ExactMatchFilter { iter: snapshot.new_iterator(), filter: filter, kb: kb, - phrase: if case_sensitive {phrase} else {phrase.to_lowercase()}, + phrase: if case_sensitive { + phrase + } else { + phrase.to_lowercase() + }, case_sensitive: case_sensitive, } } @@ -248,8 +268,8 @@ impl ExactMatchFilter { loop { let value_key = self.kb.value_key_from_doc_result(&dr); - self.iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); + self.iter + .set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); if let Some((key, value)) = self.iter.next() { debug_assert!(key.starts_with(value_key.as_bytes())); // must always be true! 
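// Illustrative sketch (hypothetical values, not part of the hunks around it)
// of the case handling in ExactMatchFilter::new above: the phrase is
// normalized once at construction, so a case-insensitive search never has to
// re-fold the query text for each candidate document it checks.
fn normalize_phrase(phrase: String, case_sensitive: bool) -> String {
    if case_sensitive { phrase } else { phrase.to_lowercase() }
}

#[test]
fn phrase_is_folded_once() {
    assert_eq!(normalize_phrase("Hello World".to_string(), false), "hello world");
    assert_eq!(normalize_phrase("Hello World".to_string(), true), "Hello World");
}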
@@ -303,7 +323,7 @@ impl QueryRuntimeFilter for ExactMatchFilter { fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { self.filter.check_double_not(parent_is_neg) } - + fn is_all_not(&self) -> bool { self.filter.is_all_not() } @@ -321,7 +341,8 @@ impl RangeFilter { pub fn new(snapshot: &Snapshot, kb: KeyBuilder, min: Option, - max: Option) -> RangeFilter { + max: Option) + -> RangeFilter { RangeFilter { iter: snapshot.new_iterator(), kb: kb, @@ -339,24 +360,17 @@ impl QueryRuntimeFilter for RangeFilter { // `min` and `max` have the save type, so picking one is OK let range_operator = self.min.as_ref().or(self.max.as_ref()).unwrap(); match range_operator { - &RangeOperator::Inclusive(_) | &RangeOperator::Exclusive(_) => { - self.kb.number_key(start.seq) - }, - &RangeOperator::True => { - self.kb.bool_null_key('T', start.seq) - }, - &RangeOperator::False => { - self.kb.bool_null_key('F', start.seq) - }, - &RangeOperator::Null => { - self.kb.bool_null_key('N', start.seq) - } + &RangeOperator::Inclusive(_) | + &RangeOperator::Exclusive(_) => self.kb.number_key(start.seq), + &RangeOperator::True => self.kb.bool_null_key('T', start.seq), + &RangeOperator::False => self.kb.bool_null_key('F', start.seq), + &RangeOperator::Null => self.kb.bool_null_key('N', start.seq), } }; // NOTE vmx 2017-04-13: Iterating over keys is really similar to the // `DocResultIterator` in `snapshot.rs`. It should probablly be unified. - self.iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); + self.iter + .set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); KeyBuilder::truncate_to_keypathword(&mut value_key); self.keypath = value_key; self.next_result() @@ -366,21 +380,20 @@ impl QueryRuntimeFilter for RangeFilter { while let Some((key, value)) = self.iter.next() { if !key.starts_with(self.keypath.as_bytes()) { // we passed the key path we are interested in. nothing left to do - return None + return None; } - let key_str = unsafe{ str::from_utf8_unchecked(&key) }; + let key_str = unsafe { str::from_utf8_unchecked(&key) }; // The key already matched, hence it's a valid doc result. Return it. - if self.min == Some(RangeOperator::True) - || self.min == Some(RangeOperator::False) - || self.min == Some(RangeOperator::Null) { + if self.min == Some(RangeOperator::True) || self.min == Some(RangeOperator::False) || + self.min == Some(RangeOperator::Null) { let dr = KeyBuilder::parse_doc_result_from_key(&key_str); return Some(dr); } // Else it's a range query on numbers - let number = unsafe{ + let number = unsafe { let array = *(value[..].as_ptr() as *const [_; 8]); mem::transmute::<[u8; 8], f64>(array) }; @@ -410,8 +423,7 @@ impl QueryRuntimeFilter for RangeFilter { } // TODO vmx 2017-04-13: Scoring is not implemented yet - fn prepare_relevancy_scoring(&mut self, _qsi: &mut QueryScoringInfo) { - } + fn prepare_relevancy_scoring(&mut self, _qsi: &mut QueryScoringInfo) {} fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { Ok(()) @@ -441,18 +453,21 @@ impl DistanceFilter { fn result(&mut self, base: Option<(DocResult, Vec)>) -> Option { // yes this code complex. I tried to break it up, but it wants to be like this. 
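// A worked example, with hypothetical positions, of the window arithmetic in
// the loop below: `distance` counts the words between matched terms, so
// dis = distance + 1 bounds abs(pos_a - pos_b), and saturating_sub keeps the
// lower edge of the search window from underflowing near position 0.
#[test]
fn proximity_window() {
    let distance: u32 = 2; // e.g. a `~2=` proximity query
    let dis = distance + 1;
    let pos: u32 = 7;
    let lo = pos.saturating_sub(dis); // 4
    let hi = pos + dis; // 10
    assert!(lo <= 5 && 5 <= hi); // a term at position 5 is within range
}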
- // this is the number of matches left before all terms match and we can return a result + // this is the number of matches left before all terms match and we can return a result let mut matches_left = self.filters.len() - 1; - if base.is_none() { return None; } + if base.is_none() { + return None; + } let (mut base_result, positions) = base.unwrap(); // This contains tuples of word postions and the filter they came from, // sorted by word position. - let mut base_positions: Vec<(u32, usize)> = positions.iter() - .map(|pos|(*pos, self.current_filter)) - .collect(); - + let mut base_positions: Vec<(u32, usize)> = positions + .iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); + // distance is number of words between searched words. // add one to make calculating difference easier since abs(posa - posb) == distance + 1 let dis = self.distance + 1; @@ -463,16 +478,19 @@ impl DistanceFilter { } let next = self.filters[self.current_filter].first_result(&base_result); - - if next.is_none() { return None; } + + if next.is_none() { + return None; + } let (next_result, next_positions) = next.unwrap(); if base_result != next_result { // not same field, next_result becomes base_result. base_result = next_result; - base_positions = next_positions.iter() - .map(|pos| (*pos, self.current_filter)) - .collect(); + base_positions = next_positions + .iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); matches_left = self.filters.len() - 1; continue; @@ -487,14 +505,13 @@ impl DistanceFilter { for &pos in next_positions.iter() { // coud these lines be any longer? No they could not. let sub = pos.saturating_sub(dis); // underflows othewises - let start = match base_positions.binary_search_by_key(&(sub), - |&(pos2,_)| pos2) { + let start = match base_positions.binary_search_by_key(&(sub), |&(pos2, _)| pos2) { Ok(start) => start, Err(start) => start, }; - let end = match base_positions.binary_search_by_key(&(pos+dis), - |&(pos2,_)| pos2) { + let end = match base_positions.binary_search_by_key(&(pos + dis), + |&(pos2, _)| pos2) { Ok(end) => end, Err(end) => end, }; @@ -504,9 +521,9 @@ impl DistanceFilter { for &(_, filter_n) in base_positions[start..end].iter() { filters_encountered.insert(filter_n); } - + if filters_encountered.len() == self.filters.len() - matches_left { - // we encountered all the filters we can at this stage, + // we encountered all the filters we can at this stage, // so we should add them all to the new_positions_map for &(prev_pos, filter_n) in base_positions[start..end].iter() { new_positions_map.insert(prev_pos, filter_n); @@ -528,13 +545,16 @@ impl DistanceFilter { } // we didn't match on next_result, so get next_result on current filter let next = self.filters[self.current_filter].next_result(); - - if next.is_none() { return None; } + + if next.is_none() { + return None; + } let (next_result, next_positions) = next.unwrap(); base_result = next_result; - base_positions = next_positions.iter() - .map(|pos| (*pos, self.current_filter)) - .collect(); + base_positions = next_positions + .iter() + .map(|pos| (*pos, self.current_filter)) + .collect(); matches_left = self.filters.len() - 1; } @@ -557,11 +577,11 @@ impl QueryRuntimeFilter for DistanceFilter { f.prepare_relevancy_scoring(&mut qsi); } } - + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { Ok(()) } - + fn is_all_not(&self) -> bool { false } @@ -586,9 +606,11 @@ impl<'a> AndFilter<'a> { fn result(&mut self, base: Option) -> Option { let mut matches_count = self.filters.len() - 1; - 
if base.is_none() { return None; } + if base.is_none() { + return None; + } let mut base_result = base.unwrap(); - + base_result.arraypath.resize(self.array_depth, 0); loop { @@ -598,8 +620,10 @@ impl<'a> AndFilter<'a> { } let next = self.filters[self.current_filter].first_result(&base_result); - - if next.is_none() { return None; } + + if next.is_none() { + return None; + } let mut next_result = next.unwrap(); next_result.arraypath.resize(self.array_depth, 0); @@ -641,7 +665,7 @@ impl<'a> QueryRuntimeFilter for AndFilter<'a> { } Ok(()) } - + fn is_all_not(&self) -> bool { for f in self.filters.iter() { if !f.is_all_not() { @@ -669,13 +693,20 @@ impl<'a> FilterWithResult<'a> { } if self.result.is_none() { self.result = self.filter.first_result(start); - } else if self.result.as_ref().unwrap().less(start, self.array_depth) { + } else if self.result + .as_ref() + .unwrap() + .less(start, self.array_depth) { self.result = self.filter.first_result(start); } if self.result.is_none() { self.is_done = true; } else { - self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); + self.result + .as_mut() + .unwrap() + .arraypath + .resize(self.array_depth, 0); } } @@ -689,7 +720,11 @@ impl<'a> FilterWithResult<'a> { if self.result.is_none() { self.is_done = true; } else { - self.result.as_mut().unwrap().arraypath.resize(self.array_depth, 0); + self.result + .as_mut() + .unwrap() + .arraypath + .resize(self.array_depth, 0); } } } @@ -701,20 +736,23 @@ pub struct OrFilter<'a> { impl<'a> OrFilter<'a> { pub fn new(left: Box, - right: Box, - array_depth: usize) -> OrFilter<'a> { + right: Box, + array_depth: usize) + -> OrFilter<'a> { OrFilter { - left: FilterWithResult{filter: left, - result: None, - array_depth: array_depth, - is_done: false, - }, - - right: FilterWithResult{filter: right, - result: None, - array_depth: array_depth, - is_done: false, - }, + left: FilterWithResult { + filter: left, + result: None, + array_depth: array_depth, + is_done: false, + }, + + right: FilterWithResult { + filter: right, + result: None, + array_depth: array_depth, + is_done: false, + }, } } fn take_smallest(&mut self) -> Option { @@ -727,17 +765,17 @@ impl<'a> OrFilter<'a> { // left is smallest, return and put back right self.right.result = Some(right); Some(left) - }, + } Ordering::Greater => { // right is smallest, return and put back left self.left.result = Some(left); Some(right) - }, + } Ordering::Equal => { left.combine(&mut right); self.right.result = Some(right); Some(left) - }, + } } } else { // right doesn't exist. return left @@ -779,7 +817,7 @@ impl<'a> QueryRuntimeFilter for OrFilter<'a> { try!(self.right.filter.check_double_not(parent_is_neg)); Ok(()) } - + fn is_all_not(&self) -> bool { if self.left.filter.is_all_not() && self.right.filter.is_all_not() { true @@ -838,12 +876,13 @@ impl<'a> QueryRuntimeFilter for NotFilter<'a> { fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { if parent_is_neg { return Err(Error::Parse("Logical not (\"!\") is nested inside of another logical not. \ - This is not allowed.".to_string())); + This is not allowed." 
+ .to_string())); } try!(self.filter.check_double_not(true)); Ok(()) } - + fn is_all_not(&self) -> bool { true } @@ -858,10 +897,10 @@ pub struct BindFilter<'a> { } impl<'a> BindFilter<'a> { - pub fn new(bind_var_name: String, filter: Box, - kb: KeyBuilder) -> BindFilter { + kb: KeyBuilder) + -> BindFilter { BindFilter { bind_var_name: bind_var_name, filter: filter, @@ -870,11 +909,11 @@ impl<'a> BindFilter<'a> { option_next: None, } } - + fn collect_results(&mut self, mut first: DocResult) -> Option { let value_key = self.kb.value_key_from_doc_result(&first); first.add_bind_name_result(&self.bind_var_name, value_key); - + while let Some(next) = self.filter.next_result() { if next.seq == first.seq { let value_key = self.kb.value_key_from_doc_result(&next); @@ -928,7 +967,7 @@ impl<'a> QueryRuntimeFilter for BindFilter<'a> { fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { self.filter.check_double_not(parent_is_neg) } - + fn is_all_not(&self) -> bool { self.filter.is_all_not() } @@ -974,9 +1013,8 @@ impl<'a> QueryRuntimeFilter for BoostFilter<'a> { fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { self.filter.check_double_not(parent_is_neg) } - + fn is_all_not(&self) -> bool { self.filter.is_all_not() } } - diff --git a/src/index.rs b/src/index.rs index 9437369..d127284 100644 --- a/src/index.rs +++ b/src/index.rs @@ -16,7 +16,7 @@ use rocksdb::{MergeOperands, IteratorMode, Snapshot as RocksSnapshot, Compaction pub use rocksdb::WriteBatch; use error::Error; -use json_shred::{Shredder}; +use json_shred::Shredder; use key_builder::KeyBuilder; use snapshot::Snapshot; @@ -43,7 +43,7 @@ impl Batch { } pub enum OpenOptions { - Create + Create, } impl Index { @@ -63,7 +63,7 @@ impl Index { rocks_options.set_comparator("noise_cmp", Index::compare_keys); rocks_options.set_merge_operator("noise_merge", Index::sum_merge); rocks_options.set_compaction_filter("noise_compact", Index::compaction_filter); - + let rocks = match rocksdb::DB::open(&rocks_options, name) { Ok(rocks) => rocks, Err(error) => { @@ -75,12 +75,14 @@ impl Index { rocks_options.create_if_missing(true); let rocks = try!(rocksdb::DB::open(&rocks_options, name)); - - let mut bytes = Vec::with_capacity(8*2); - bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap(); + + let mut bytes = Vec::with_capacity(8 * 2); + bytes + .write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)) + .unwrap(); bytes.write(&Index::convert_u64_to_bytes(0)).unwrap(); try!(rocks.put_opt(b"HDB", &bytes, &rocksdb::WriteOptions::new())); - + rocks } }; @@ -88,9 +90,10 @@ impl Index { // validate header is there let value = try!(rocks.get(b"HDB")).unwrap(); self.rocks = Some(rocks); - assert_eq!(value.len(), 8*2); + assert_eq!(value.len(), 8 * 2); // first 8 is version - assert_eq!(Index::convert_bytes_to_u64(&value[..8]), NOISE_HEADER_VERSION); + assert_eq!(Index::convert_bytes_to_u64(&value[..8]), + NOISE_HEADER_VERSION); // next 8 is high seq self.high_doc_seq = Index::convert_bytes_to_u64(&value[8..]); self.name = name.to_string(); @@ -125,7 +128,7 @@ impl Index { if batch.id_str_in_batch.contains(&docid) { // oops use trying to add some doc 2x to this batch. return Err(Error::Write("Attempt to insert multiple docs with same _id" - .to_string())); + .to_string())); } if let Some((seq, existing_key_values)) = try!(self.gather_doc_fields(&docid)) { shredder.merge_existing_doc(existing_key_values); @@ -137,7 +140,10 @@ impl Index { } } else { // no doc id supplied in document, so we create one. 
- let docid = Uuid::new(UuidVersion::Random).unwrap().simple().to_string(); + let docid = Uuid::new(UuidVersion::Random) + .unwrap() + .simple() + .to_string(); try!(shredder.add_id(&docid)); self.high_doc_seq += 1; (self.high_doc_seq, docid) @@ -157,12 +163,11 @@ impl Index { if batch.id_str_in_batch.contains(docid) { // oops use trying to delete a doc that's in the batch. Can't happen, return Err(Error::Write("Attempt to delete doc with same _id added earlier" - .to_string())); + .to_string())); } if let Some((seq, key_values)) = try!(self.gather_doc_fields(docid)) { let mut shredder = Shredder::new(); - try!(shredder.delete_existing_doc(docid, seq, key_values, - &mut batch.wb)); + try!(shredder.delete_existing_doc(docid, seq, key_values, &mut batch.wb)); batch.id_str_in_batch.insert(docid.to_string()); Ok(true) } else { @@ -170,18 +175,21 @@ impl Index { } } - fn gather_doc_fields(&self, docid: &str) -> - Result>)>, Error> { + fn gather_doc_fields(&self, + docid: &str) + -> Result>)>, Error> { if let Some(seq) = try!(self.fetch_seq(&docid)) { // collect up all the fields for the existing doc let kb = KeyBuilder::new(); let value_key = kb.value_key(seq); let mut key_values = BTreeMap::new(); - - let mut iter = self.rocks.as_ref().unwrap().iterator(IteratorMode::Start); + + let mut iter = self.rocks + .as_ref() + .unwrap() + .iterator(IteratorMode::Start); // Seek in index to >= entry - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); + iter.set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); loop { let (key, value) = match iter.next() { Some((key, value)) => (key, value), @@ -191,8 +199,8 @@ impl Index { if !key.starts_with(value_key.as_bytes()) { break; } - let key = unsafe{ str::from_utf8_unchecked(&key)}.to_string(); - let value = value.iter().map(|i|*i).collect(); + let key = unsafe { str::from_utf8_unchecked(&key) }.to_string(); + let value = value.iter().map(|i| *i).collect(); key_values.insert(key, value); } return Ok(Some((seq, key_values))); @@ -209,9 +217,13 @@ impl Index { } let rocks = self.rocks.as_ref().unwrap(); - let mut bytes = Vec::with_capacity(8*2); - bytes.write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)).unwrap(); - bytes.write(&Index::convert_u64_to_bytes(self.high_doc_seq)).unwrap(); + let mut bytes = Vec::with_capacity(8 * 2); + bytes + .write(&Index::convert_u64_to_bytes(NOISE_HEADER_VERSION)) + .unwrap(); + bytes + .write(&Index::convert_u64_to_bytes(self.high_doc_seq)) + .unwrap(); try!(batch.wb.put(b"HDB", &bytes)); let status = try!(rocks.write(batch.wb)); @@ -239,13 +251,13 @@ impl Index { for (n, b) in bytes.iter().enumerate() { buffer[n] = *b; } - unsafe{ mem::transmute(buffer) } + unsafe { mem::transmute(buffer) } } - + /// Should not be used generally since it not varint. Used for header fields /// since only one header is in the database it's not a problem with excess size. 
fn convert_u64_to_bytes(val: u64) -> [u8; 8] { - unsafe{ mem::transmute(val) } + unsafe { mem::transmute(val) } } pub fn convert_bytes_to_i32(bytes: &[u8]) -> i32 { @@ -254,14 +266,14 @@ impl Index { let mut read = Cursor::new(vec); read.read_signed_varint_32().unwrap() } - + pub fn convert_i32_to_bytes(val: i32) -> Vec { let mut bytes = Cursor::new(Vec::new()); assert!(bytes.write_signed_varint_32(val).is_ok()); bytes.into_inner() } - pub fn fetch_seq(&self, id: &str) -> Result, Error> { + pub fn fetch_seq(&self, id: &str) -> Result, Error> { // Fetching an seq is only possible if the index is open // NOTE vmx 2016-10-17: Perhaps that shouldn't panic? assert!(&self.rocks.is_some()); @@ -271,7 +283,7 @@ impl Index { match try!(rocks.get(&key.as_bytes())) { // If there is an id, it's UTF-8 Some(bytes) => Ok(Some(bytes.to_utf8().unwrap().parse().unwrap())), - None => Ok(None) + None => Ok(None), } } @@ -289,8 +301,8 @@ impl Index { fn compare_keys(a: &[u8], b: &[u8]) -> Ordering { let value_prefixes = ['W', 'f', 'T', 'F', 'N']; if value_prefixes.contains(&(a[0] as char)) && value_prefixes.contains(&(b[0] as char)) { - let astr = unsafe {str::from_utf8_unchecked(&a)}; - let bstr = unsafe {str::from_utf8_unchecked(&b)}; + let astr = unsafe { str::from_utf8_unchecked(&a) }; + let bstr = unsafe { str::from_utf8_unchecked(&b) }; KeyBuilder::compare_keys(astr, bstr) } else { a.cmp(b) @@ -298,9 +310,9 @@ impl Index { } fn sum_merge(new_key: &[u8], - existing_val: Option<&[u8]>, - operands: &mut MergeOperands) - -> Vec { + existing_val: Option<&[u8]>, + operands: &mut MergeOperands) + -> Vec { if !(new_key[0] as char == 'C' || new_key[0] as char == 'K') { panic!("unknown key type to merge!"); } @@ -310,7 +322,7 @@ impl Index { } else { 0 }; - + for bytes in operands { count += Index::convert_bytes_to_i32(&bytes); } @@ -351,7 +363,7 @@ mod tests { let id = index.add(r#"{"foo":"bar"}"#, &mut batch).unwrap(); index.flush(batch).unwrap(); - + let mut results = Query::get_matches(r#"find {foo:=="bar"}"#, &index).unwrap(); let query_id = results.get_next_id().unwrap(); assert!(query_id.len() == 32); @@ -375,7 +387,7 @@ mod tests { index.flush(batch).unwrap(); let rocks = index.rocks.as_mut().unwrap(); - + // apparently you need to do compaction twice when there are merges // first one lets the merges happen, the second lets them be collected. // this is acceptable since eventually the keys go away. 
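// A minimal sketch of the counting scheme sum_merge implements above, using
// plain i32s in place of the varint-encoded operands: each indexed word
// merges +1 into its 'C'/'K' count key and each delete merges -1, so once a
// key's running sum reaches zero it can (presumably via the compaction
// filter registered earlier) be dropped, which is why the test above has to
// compact twice before the keys go away.
#[test]
fn merge_counts_cancel() {
    let operands = [1i32, 1, -1, -1]; // two adds, then the matching deletes
    let count: i32 = operands.iter().sum();
    assert_eq!(count, 0); // a zero count is eligible for removal
}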
@@ -397,38 +409,38 @@ mod tests { let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - //let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": ["a","b",["c","d",["e"]]]}"#).unwrap(); - - //index.flush().unwrap(); - let mut batch = Batch::new(); - let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}"#, &mut batch).unwrap(); - + let _ = index + .add(r#"{"_id":"1", "foo":"array", "baz": [1,2,[3,4,[5]]]}"#, + &mut batch) + .unwrap(); + index.flush(batch).unwrap(); { - let rocks = index.rocks.as_mut().unwrap(); + let rocks = index.rocks.as_mut().unwrap(); - let mut results = Vec::new(); - for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { - if key[0] as char == 'V' { - let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); - results.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); + let mut results = Vec::new(); + for (key, value) in rocks.iterator(rocksdb::IteratorMode::Start) { + if key[0] as char == 'V' { + let key_string = unsafe { str::from_utf8_unchecked((&key)) }.to_string(); + results.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); + } } - } - let expected = vec![ - ("V1#._id".to_string(), JsonValue::String("1".to_string())), - ("V1#.baz$0".to_string(), JsonValue::Number(1.0)), - ("V1#.baz$1".to_string(), JsonValue::Number(2.0)), - ("V1#.baz$2$0".to_string(), JsonValue::Number(3.0)), - ("V1#.baz$2$1".to_string(), JsonValue::Number(4.0)), - ("V1#.baz$2$2$0".to_string(), JsonValue::Number(5.0)), - ("V1#.foo".to_string(), JsonValue::String("array".to_string()))]; - assert_eq!(results, expected); + let expected = vec![("V1#._id".to_string(), JsonValue::String("1".to_string())), + ("V1#.baz$0".to_string(), JsonValue::Number(1.0)), + ("V1#.baz$1".to_string(), JsonValue::Number(2.0)), + ("V1#.baz$2$0".to_string(), JsonValue::Number(3.0)), + ("V1#.baz$2$1".to_string(), JsonValue::Number(4.0)), + ("V1#.baz$2$2$0".to_string(), JsonValue::Number(5.0)), + ("V1#.foo".to_string(), JsonValue::String("array".to_string()))]; + assert_eq!(results, expected); } let mut batch = Batch::new(); - let _ = index.add(r#"{"_id":"1", "foo":"array", "baz": []}"#, &mut batch).unwrap(); + let _ = index + .add(r#"{"_id":"1", "foo":"array", "baz": []}"#, &mut batch) + .unwrap(); index.flush(batch).unwrap(); let rocks = index.rocks.as_mut().unwrap(); @@ -440,11 +452,9 @@ mod tests { results.push((key_string, JsonFetcher::bytes_to_json_value(&*value))); } } - let expected = vec![ - ("V1#._id".to_string(), JsonValue::String("1".to_string())), - ("V1#.baz".to_string(), JsonValue::Array(vec![])), - ("V1#.foo".to_string(), JsonValue::String("array".to_string())) - ]; + let expected = vec![("V1#._id".to_string(), JsonValue::String("1".to_string())), + ("V1#.baz".to_string(), JsonValue::Array(vec![])), + ("V1#.foo".to_string(), JsonValue::String("array".to_string()))]; assert_eq!(results, expected); } } diff --git a/src/json_shred.rs b/src/json_shred.rs index 28d1ccd..8ef31ca 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -17,9 +17,12 @@ use key_builder::KeyBuilder; use stems::Stems; use index::Index; -// Good example of using rustc_serialize: https://github.com/ajroetker/beautician/blob/master/src/lib.rs -// Callback based JSON streaming parser: https://github.com/gyscos/json-streamer.rs -// Another parser pased on rustc_serializ: https://github.com/isagalaev/ijson-rust/blob/master/src/test.rs#L11 +// Good example of using rustc_serialize: +// 
https://github.com/ajroetker/beautician/blob/master/src/lib.rs +// Callback based JSON streaming parser: +// https://github.com/gyscos/json-streamer.rs +// Another parser pased on rustc_serializ: +// https://github.com/isagalaev/ijson-rust/blob/master/src/test.rs#L11 enum ObjectKeyTypes { /// _id field @@ -50,8 +53,12 @@ impl Shredder { } } - fn add_number_entries(kb: &mut KeyBuilder, number: &[u8], docseq: u64, - batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { + fn add_number_entries(kb: &mut KeyBuilder, + number: &[u8], + docseq: u64, + batch: &mut rocksdb::WriteBatch, + delete: bool) + -> Result<(), Error> { // Add/delete the key that is used for range lookups let number_key = kb.number_key(docseq); if delete { @@ -64,8 +71,12 @@ impl Shredder { Ok(()) } - fn add_bool_null_entries(kb: &mut KeyBuilder, prefix: char, docseq: u64, - batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { + fn add_bool_null_entries(kb: &mut KeyBuilder, + prefix: char, + docseq: u64, + batch: &mut rocksdb::WriteBatch, + delete: bool) + -> Result<(), Error> { let key = kb.bool_null_key(prefix, docseq); if delete { try!(batch.delete(&key.as_bytes())); @@ -77,23 +88,30 @@ impl Shredder { Ok(()) } - fn add_stemmed_entries(kb: &mut KeyBuilder, text: &str, docseq: u64, - batch: &mut rocksdb::WriteBatch, delete: bool) -> Result<(), Error> { + fn add_stemmed_entries(kb: &mut KeyBuilder, + text: &str, + docseq: u64, + batch: &mut rocksdb::WriteBatch, + delete: bool) + -> Result<(), Error> { let stems = Stems::new(text); let mut word_to_word_positions = HashMap::new(); let mut total_words: i32 = 0; let mut one_enc_bytes = Cursor::new(Vec::new()); - let num = if delete {-1} else {1}; + let num = if delete { -1 } else { 1 }; assert!(one_enc_bytes.write_signed_varint_32(num).is_ok()); for stem in stems { total_words += 1; let &mut (ref mut word_positions, ref mut count) = - word_to_word_positions.entry(stem.stemmed) - .or_insert((Cursor::new(Vec::new()), 0)); + word_to_word_positions + .entry(stem.stemmed) + .or_insert((Cursor::new(Vec::new()), 0)); if !delete { - assert!(word_positions.write_unsigned_varint_32(stem.word_pos).is_ok()); + assert!(word_positions + .write_unsigned_varint_32(stem.word_pos) + .is_ok()); } *count += 1; } @@ -104,7 +122,7 @@ impl Shredder { try!(batch.delete(&key.into_bytes())); } else { try!(batch.put(&key.into_bytes(), &word_positions.into_inner())); - } + } let key = kb.field_length_key(docseq); if delete { @@ -112,7 +130,7 @@ impl Shredder { } else { try!(batch.put(&key.into_bytes(), &Index::convert_i32_to_bytes(total_words))); } - + let key = kb.keypathword_count_key(&stemmed); if delete { try!(batch.merge(&key.into_bytes(), &Index::convert_i32_to_bytes(-count))); @@ -126,7 +144,7 @@ impl Shredder { Ok(()) } - + fn add_value(&mut self, code: char, value: &[u8]) -> Result<(), Error> { let key = self.kb.value_key_path_only(); let mut buffer = Vec::with_capacity(value.len() + 1); @@ -136,20 +154,24 @@ impl Shredder { Ok(()) } - fn maybe_add_value(&mut self, parser: &Parser, code: char, value: &[u8]) -> Result<(), Error> { + fn maybe_add_value(&mut self, + parser: &Parser, + code: char, + value: &[u8]) + -> Result<(), Error> { match self.extract_key(parser.stack().top()) { ObjectKeyTypes::Id => { if code != 's' && self.kb.keypath_segments_len() == 1 { //nested fields can be _id, not root fields - return Err(Error::Shred( - "Expected string for `_id` field, got another type".to_string())); + return Err(Error::Shred("Expected string for `_id` field, got 
another type" + .to_string())); } - self.doc_id = Some(unsafe{ str::from_utf8_unchecked(value) }.to_string()); + self.doc_id = Some(unsafe { str::from_utf8_unchecked(value) }.to_string()); self.kb.pop_object_key(); self.kb.push_object_key("_id"); *self.object_keys_indexed.last_mut().unwrap() = true; try!(self.add_value(code, &value)); - }, + } ObjectKeyTypes::Key(key) => { // Pop the dummy object that makes ObjectEnd happy // or the previous object key @@ -157,11 +179,11 @@ impl Shredder { self.kb.push_object_key(&key); *self.object_keys_indexed.last_mut().unwrap() = true; try!(self.add_value(code, &value)); - }, + } ObjectKeyTypes::NoKey => { try!(self.add_value(code, &value)); self.kb.inc_top_array_offset(); - }, + } } Ok(()) } @@ -175,7 +197,7 @@ impl Shredder { } else { ObjectKeyTypes::Key(key.to_string()) } - }, + } _ => ObjectKeyTypes::NoKey, } } @@ -185,8 +207,8 @@ impl Shredder { fn maybe_push_key(&mut self, stack_element: Option) -> Result<(), Error> { if let Some(StackElement::Key(key)) = stack_element { if self.kb.keypath_segments_len() == 1 && key == "_id" { - return Err(Error::Shred( - "Expected string for `_id` field, got another type".to_string())); + return Err(Error::Shred("Expected string for `_id` field, got another type" + .to_string())); } else { // Pop the dummy object that makes ObjectEnd happy // or the previous object key @@ -198,27 +220,30 @@ impl Shredder { Ok(()) } - pub fn add_all_to_batch(&mut self, seq: u64, - batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + pub fn add_all_to_batch(&mut self, + seq: u64, + batch: &mut rocksdb::WriteBatch) + -> Result<(), Error> { for (key, value) in &self.existing_key_value_to_delete { self.kb.clear(); - self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); + self.kb + .parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); match value[0] as char { 's' => { - let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + let text = unsafe { str::from_utf8_unchecked(&value[1..]) }; try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); - }, + } 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); - }, + } 'T' | 'F' | 'N' => { try!(Shredder::add_bool_null_entries(&mut self.kb, value[0] as char, seq, batch, true)); - }, - _ => {}, + } + _ => {} } try!(batch.delete(&key.as_bytes())); } @@ -229,20 +254,20 @@ impl Shredder { self.kb.parse_value_key_path_only(&key); match value[0] as char { 's' => { - let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + let text = unsafe { str::from_utf8_unchecked(&value[1..]) }; try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, false)); - }, + } 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, false)); - }, + } 'T' | 'F' | 'N' => { try!(Shredder::add_bool_null_entries(&mut self.kb, value[0] as char, seq, batch, false)); - }, - _ => {}, + } + _ => {} } let key = self.kb.value_key(seq); try!(batch.put(&key.as_bytes(), &value.as_ref())); @@ -257,29 +282,31 @@ impl Shredder { pub fn delete_existing_doc(&mut self, docid: &str, - seq: u64, + seq: u64, existing: BTreeMap>, - batch: &mut rocksdb::WriteBatch) -> Result<(), Error> { + batch: &mut rocksdb::WriteBatch) + -> Result<(), Error> { self.doc_id = Some(docid.to_string()); for (key, value) in existing.into_iter() { self.kb.clear(); - self.kb.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); + self.kb + 
.parse_value_key_path_only(KeyBuilder::value_key_path_only_from_str(&key)); match value[0] as char { 's' => { - let text = unsafe{ str::from_utf8_unchecked(&value[1..]) }; + let text = unsafe { str::from_utf8_unchecked(&value[1..]) }; try!(Shredder::add_stemmed_entries(&mut self.kb, text, seq, batch, true)); - }, + } 'f' => { try!(Shredder::add_number_entries(&mut self.kb, &value, seq, batch, true)); - }, + } 'T' | 'F' | 'N' => { try!(Shredder::add_bool_null_entries(&mut self.kb, value[0] as char, seq, batch, true)); - }, - _ => {}, + } + _ => {} } try!(batch.delete(&key.as_bytes())); } @@ -307,7 +334,8 @@ impl Shredder { self.shredded_key_values.remove(key).unwrap(); } else { // we need to delete these keys and the index keys assocaited with the valuess - self.existing_key_value_to_delete.insert(existing_key, existing_value); + self.existing_key_value_to_delete + .insert(existing_key, existing_value); } } } @@ -331,7 +359,7 @@ impl Shredder { // Just push something to make `ObjectEnd` happy self.kb.push_object_key(""); self.object_keys_indexed.push(false); - }, + } Some(JsonEvent::ObjectEnd) => { self.kb.pop_object_key(); if !self.object_keys_indexed.pop().unwrap() { @@ -340,11 +368,11 @@ impl Shredder { try!(self.maybe_add_value(&parser, 'o', &[])); } self.kb.inc_top_array_offset(); - }, + } Some(JsonEvent::ArrayStart) => { try!(self.maybe_push_key(parser.stack().top())); self.kb.push_array(); - }, + } Some(JsonEvent::ArrayEnd) => { if self.kb.peek_array_offset() == 0 { // this means we never wrote a value because the object was empty. @@ -355,34 +383,34 @@ impl Shredder { self.kb.pop_array(); } self.kb.inc_top_array_offset(); - }, + } Some(JsonEvent::StringValue(value)) => { try!(self.maybe_add_value(&parser, 's', &value.as_bytes())); - }, + } Some(JsonEvent::BooleanValue(tf)) => { - let code = if tf {'T'} else {'F'}; + let code = if tf { 'T' } else { 'F' }; try!(self.maybe_add_value(&parser, code, &[])); - }, + } Some(JsonEvent::I64Value(i)) => { let f = i as f64; - let bytes = unsafe{ transmute::(f) }; + let bytes = unsafe { transmute::(f) }; try!(self.maybe_add_value(&parser, 'f', &bytes[..])); - }, + } Some(JsonEvent::U64Value(u)) => { let f = u as f64; - let bytes = unsafe{ transmute::(f) }; + let bytes = unsafe { transmute::(f) }; try!(self.maybe_add_value(&parser, 'f', &bytes[..])); - }, + } Some(JsonEvent::F64Value(f)) => { - let bytes = unsafe{ transmute::(f) }; + let bytes = unsafe { transmute::(f) }; try!(self.maybe_add_value(&parser, 'f', &bytes[..])); - }, + } Some(JsonEvent::NullValue) => { try!(self.maybe_add_value(&parser, 'N', &[])); - }, + } Some(JsonEvent::Error(error)) => { return Err(Error::Shred(error.to_string())); - }, + } None => { break; } @@ -436,7 +464,7 @@ mod tests { } - #[test] + #[test] fn test_shred_nested() { let mut shredder = super::Shredder::new(); let json = r#"{"some": ["array", "data", ["also", "nested"]]}"#; @@ -456,17 +484,15 @@ mod tests { rocks.write(batch).unwrap(); let result = positions_from_rocks(&rocks); - let expected = vec![ - ("W._id!foo#123,".to_string(), vec![0]), - ("W.some$!array#123,0".to_string(), vec![0]), - ("W.some$!data#123,1".to_string(), vec![0]), - ("W.some$$!also#123,2,0".to_string(), vec![0]), - ("W.some$$!nest#123,2,1".to_string(), vec![0]), - ]; + let expected = vec![("W._id!foo#123,".to_string(), vec![0]), + ("W.some$!array#123,0".to_string(), vec![0]), + ("W.some$!data#123,1".to_string(), vec![0]), + ("W.some$$!also#123,2,0".to_string(), vec![0]), + ("W.some$$!nest#123,2,1".to_string(), vec![0])]; assert_eq!(result, 
expected); } - #[test] + #[test] fn test_shred_double_nested() { let mut shredder = super::Shredder::new(); let json = r#"{"a":{"a":"b"}}"#; @@ -486,10 +512,8 @@ mod tests { rocks.write(batch).unwrap(); let result = values_from_rocks(&rocks); - let expected = vec![ - ("V123#._id".to_string(), JsonValue::String("foo".to_string())), - ("V123#.a.a".to_string(), JsonValue::String("b".to_string())) - ]; + let expected = vec![("V123#._id".to_string(), JsonValue::String("foo".to_string())), + ("V123#.a.a".to_string(), JsonValue::String("b".to_string()))]; assert_eq!(result, expected); } @@ -516,16 +540,14 @@ mod tests { rocks.write(batch).unwrap(); let result = positions_from_rocks(&rocks); - let expected = vec![ - ("W.A$.B!b1#1234,1".to_string(), vec![0]), - ("W.A$.B!b2vmx#1234,0".to_string(), vec![0]), - ("W.A$.B!three#1234,0".to_string(), vec![10]), - ("W.A$.B!two#1234,0".to_string(), vec![6]), - ("W.A$.C!..#1234,0".to_string(), vec![0]), - ("W.A$.C!..#1234,1".to_string(), vec![0]), - ("W.A$.C!c2#1234,0".to_string(), vec![2]), - ("W.A$.C!c2#1234,1".to_string(), vec![2]), - ]; + let expected = vec![("W.A$.B!b1#1234,1".to_string(), vec![0]), + ("W.A$.B!b2vmx#1234,0".to_string(), vec![0]), + ("W.A$.B!three#1234,0".to_string(), vec![10]), + ("W.A$.B!two#1234,0".to_string(), vec![6]), + ("W.A$.C!..#1234,0".to_string(), vec![0]), + ("W.A$.C!..#1234,1".to_string(), vec![0]), + ("W.A$.C!c2#1234,0".to_string(), vec![2]), + ("W.A$.C!c2#1234,1".to_string(), vec![2])]; assert_eq!(result, expected); } @@ -544,7 +566,7 @@ mod tests { let mut index = Index::new(); index.open(dbname, Some(OpenOptions::Create)).unwrap(); - + let rocks = &index.rocks.unwrap(); rocks.write(batch).unwrap(); diff --git a/src/json_value.rs b/src/json_value.rs index 7151cf8..5e4a225 100644 --- a/src/json_value.rs +++ b/src/json_value.rs @@ -19,8 +19,8 @@ pub enum JsonValue { } impl JsonValue { - pub fn str_to_literal(string: &str) ->String { - let mut ret = String::with_capacity(string.len()*2+2); + pub fn str_to_literal(string: &str) -> String { + let mut ret = String::with_capacity(string.len() * 2 + 2); ret.push('"'); for c in string.chars() { if c == '"' || c == '\\' { @@ -48,10 +48,10 @@ impl JsonValue { } } else { panic!("cast error in cmp_f64"); - } + } } else { panic!("cast error in cmp_f64"); - } + } } fn cmp_string(a: &JsonValue, b: &JsonValue) -> Ordering { @@ -61,10 +61,10 @@ impl JsonValue { a_val.cmp(&b_val) } else { panic!("cast error in cmp_string"); - } + } } else { panic!("cast error in cmp_string"); - } + } } fn cmp_array(a: &JsonValue, b: &JsonValue) -> Ordering { @@ -81,12 +81,12 @@ impl JsonValue { a_val.len().cmp(&b_val.len()) } else { panic!("cast error in cmp_array"); - } + } } else { panic!("cast error in cmp_array"); - } + } } - + fn cmp_object(a: &JsonValue, b: &JsonValue) -> Ordering { if let &JsonValue::Object(ref a_val) = a { if let &JsonValue::Object(ref b_val) = b { @@ -107,10 +107,10 @@ impl JsonValue { a_val.len().cmp(&b_val.len()) } else { panic!("cast error in cmp_object"); - } + } } else { panic!("cast error in cmp_object"); - } + } } fn type_sort_order(&self) -> (usize, fn(&JsonValue, &JsonValue) -> Ordering) { @@ -130,11 +130,11 @@ impl JsonValue { &JsonValue::Number(ref num) => { try!(write.write_all(pretty.prefix())); try!(write.write_all(num.to_string().as_bytes())); - }, + } &JsonValue::String(ref string) => { try!(write.write_all(pretty.prefix())); try!(write.write_all(JsonValue::str_to_literal(&string).as_bytes())) - }, + } &JsonValue::Array(ref array) => { if array.is_empty() { 
try!(write.write_all(pretty.prefix())); @@ -149,9 +149,7 @@ impl JsonValue { let mut iter = array.iter().peekable(); loop { match iter.next() { - Some(json) => { - try!(json.render(write, pretty)) - }, + Some(json) => try!(json.render(write, pretty)), None => break, } if iter.peek().is_some() { @@ -162,7 +160,7 @@ impl JsonValue { pretty.pop(); try!(write.write_all(pretty.prefix())); try!(write.write_all("]".as_bytes())); - }, + } &JsonValue::Object(ref object) => { if object.is_empty() { try!(write.write_all(pretty.prefix())); @@ -194,19 +192,19 @@ impl JsonValue { pretty.pop(); try!(write.write_all(pretty.prefix())); try!(write.write_all("}".as_bytes())); - }, + } &JsonValue::True => { try!(write.write_all(pretty.prefix())); try!(write.write_all("true".as_bytes())); - }, + } &JsonValue::False => { try!(write.write_all(pretty.prefix())); try!(write.write_all("false".as_bytes())); - }, + } &JsonValue::Null => { try!(write.write_all(pretty.prefix())); try!(write.write_all("null".as_bytes())) - }, + } } Ok(()) } @@ -231,7 +229,7 @@ pub struct PrettyPrint { newline: String, spacing: String, buffer: String, - next_prefix_is_space: bool + next_prefix_is_space: bool, } impl PrettyPrint { diff --git a/src/key_builder.rs b/src/key_builder.rs index 749ec4c..fc4378f 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -7,7 +7,7 @@ use std::cmp::Ordering; pub enum Segment { ObjectKey(String), Array(u64), -} +} #[derive(Debug, Clone)] pub struct KeyBuilder { @@ -17,8 +17,8 @@ pub struct KeyBuilder { impl KeyBuilder { pub fn new() -> KeyBuilder { - KeyBuilder{ - // Magic reserve number is completely arbitrary + KeyBuilder { + // Magic reserve number is completely arbitrary keypath: Vec::with_capacity(10), arraypath: Vec::with_capacity(10), } @@ -51,7 +51,7 @@ impl KeyBuilder { string.push_str(word); string } - + pub fn keypath_count_key(&self) -> String { let mut string = String::with_capacity(100); string.push('K'); @@ -74,7 +74,7 @@ impl KeyBuilder { string.push('f'); for segment in &self.keypath { string.push_str(&segment); - }; + } string.push('#'); string.push_str(&seq.to_string()); @@ -88,7 +88,7 @@ impl KeyBuilder { string.push(prefix); for segment in &self.keypath { string.push_str(&segment); - }; + } string.push('#'); string.push_str(&seq.to_string()); @@ -184,7 +184,7 @@ impl KeyBuilder { } string } - + /// Returns a value key without the doc seq prepended. pub fn value_key_path_only_from_str(str: &str) -> &str { &str[str.find('#').unwrap() + 1..] 
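// A short usage sketch for value_key_path_only_from_str above, using the
// value-key shape that appears in this patch's tests ("V123#._id"):
// everything up to and including the first '#' (the 'V' type tag plus the
// doc seq) is stripped off, leaving only the key path.
#[test]
fn strips_doc_seq_prefix() {
    let full = "V123#._id";
    assert_eq!(&full[full.find('#').unwrap() + 1..], "._id");
}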
@@ -197,12 +197,12 @@ impl KeyBuilder { (Segment::ObjectKey(_key), unescaped) => { str = &str[unescaped.len()..]; self.keypath.push(unescaped); - }, + } (Segment::Array(i), unescaped) => { str = &str[unescaped.len()..]; self.keypath.push("$".to_string()); self.arraypath.push(i); - }, + } } } } @@ -234,7 +234,7 @@ impl KeyBuilder { } } - // returns the unescaped segment as Segment and the escaped segment as a slice + // returns the unescaped segment as Segment and the escaped segment as a slice pub fn parse_first_key_value_segment(keypath: &str) -> Option<(Segment, String)> { let mut unescaped = String::with_capacity(50); @@ -253,10 +253,10 @@ impl KeyBuilder { } else { panic!("Escape char found as last char in keypath"); } - }, + } Some('.') | Some('$') => { break; - }, + } Some(c) => { len_bytes += c.len_utf8(); unescaped.push(c); @@ -267,7 +267,7 @@ impl KeyBuilder { } } Some((Segment::ObjectKey(unescaped), keypath[..len_bytes].to_string())) - }, + } Some('$') => { let mut i = String::new(); for c in chars { @@ -277,17 +277,17 @@ impl KeyBuilder { break; } } - Some((Segment::Array(i.parse().unwrap()), keypath[..1+i.len()].to_string())) - }, - Some(_) => None, // we must be past the keypath portion of string. done. + Some((Segment::Array(i.parse().unwrap()), keypath[..1 + i.len()].to_string())) + } + Some(_) => None, // we must be past the keypath portion of string. done. None => None, } - } + } pub fn push_object_key(&mut self, key: &str) { let mut escaped_key = String::with_capacity((key.len() * 2) + 1); // max expansion escaped_key.push('.'); - + for cc in key.chars() { // Escape chars that conflict with delimiters if "\\$.!#".contains(cc) { @@ -354,8 +354,8 @@ impl KeyBuilder { pub fn parse_doc_result_from_key(str: &str) -> DocResult { let mut dr = DocResult::new(); let (_path_str, seq_str, arraypath_str) = - KeyBuilder::split_keypath_seq_arraypath_from_key(&str); - dr.seq = seq_str.parse().unwrap(); + KeyBuilder::split_keypath_seq_arraypath_from_key(&str); + dr.seq = seq_str.parse().unwrap(); if !arraypath_str.is_empty() { for numstr in arraypath_str.split(",") { dr.arraypath.push(numstr.parse().unwrap()); @@ -365,27 +365,23 @@ impl KeyBuilder { } pub fn compare_keys(akey: &str, bkey: &str) -> Ordering { - debug_assert!(akey.starts_with('W') || - akey.starts_with('f') || - akey.starts_with('T') || + debug_assert!(akey.starts_with('W') || akey.starts_with('f') || akey.starts_with('T') || akey.starts_with('F') || akey.starts_with('N')); - debug_assert!(bkey.starts_with('W') || - bkey.starts_with('f') || - bkey.starts_with('T') || + debug_assert!(bkey.starts_with('W') || bkey.starts_with('f') || bkey.starts_with('T') || bkey.starts_with('F') || bkey.starts_with('N')); let (apath_str, aseq_str, aarraypath_str) = - KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); + KeyBuilder::split_keypath_seq_arraypath_from_key(&akey); let (bpath_str, bseq_str, barraypath_str) = - KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); + KeyBuilder::split_keypath_seq_arraypath_from_key(&bkey); match apath_str[0..].cmp(&bpath_str[0..]) { Ordering::Less => Ordering::Less, Ordering::Greater => Ordering::Greater, Ordering::Equal => { let aseq: u64 = aseq_str.parse().unwrap(); - let bseq: u64 = bseq_str.parse().unwrap();; + let bseq: u64 = bseq_str.parse().unwrap(); if aseq < bseq { Ordering::Less } else if aseq > bseq { @@ -423,7 +419,7 @@ impl KeyBuilder { } } } - }, + } } } } @@ -431,7 +427,7 @@ impl KeyBuilder { #[cfg(test)] mod tests { - use super::{KeyBuilder}; + use super::KeyBuilder; use 
query::DocResult; @@ -458,9 +454,10 @@ mod tests { kb.push_array(); assert_eq!(kb.keypath_segments_len(), 3, "three segments"); - assert_eq!(kb.stemmed_word_key("astemmedword", 123), "W.first.second$!astemmedword#123,0", + assert_eq!(kb.stemmed_word_key("astemmedword", 123), + "W.first.second$!astemmedword#123,0", "Key for six segments is correct"); - + kb.pop_array(); assert_eq!(kb.keypath_segments_len(), 2, "Two segments"); @@ -475,22 +472,24 @@ mod tests { #[test] fn test_doc_result_parse() { let key = "W.foo$.bar$!word#123,1,0".to_string(); - let (keypathstr, seqstr, arraypathstr) = KeyBuilder::split_keypath_seq_arraypath_from_key(&key); + let (keypathstr, seqstr, arraypathstr) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&key); assert_eq!(keypathstr, "W.foo$.bar$!word"); assert_eq!(seqstr, "123"); assert_eq!(arraypathstr, "1,0"); // make sure escaped commas and # in key path don't cause problems let key1 = "W.foo\\#$.bar\\,$!word#123,2,0".to_string(); - let (keypathstr1, seqstr1, arraypathstr1) = KeyBuilder::split_keypath_seq_arraypath_from_key(&key1); + let (keypathstr1, seqstr1, arraypathstr1) = + KeyBuilder::split_keypath_seq_arraypath_from_key(&key1); assert_eq!(keypathstr1, "W.foo\\#$.bar\\,$!word"); assert_eq!(seqstr1, "123"); assert_eq!(arraypathstr1, "2,0"); let mut dr = DocResult::new(); dr.seq = 123; - dr.arraypath = vec![1,0]; - + dr.arraypath = vec![1, 0]; + assert!(dr == KeyBuilder::parse_doc_result_from_key(&key)); } } diff --git a/src/main.rs b/src/main.rs index 45ae6e0..56bb617 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,9 +8,11 @@ use std::io::{self, BufReader}; fn main() { let mut test_mode = false; for argument in env::args() { - if argument == "-t" { - test_mode = true; - } + if argument == "-t" { + test_mode = true; + } } - repl(&mut BufReader::new(io::stdin()), &mut io::stdout(), test_mode); + repl(&mut BufReader::new(io::stdin()), + &mut io::stdout(), + test_mode); } diff --git a/src/parser.rs b/src/parser.rs index 9295a0d..575f9bd 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -58,7 +58,7 @@ impl<'a, 'c> Parser<'a, 'c> { } - fn must_consume(&mut self, token: &str) -> Result<(), Error> { + fn must_consume(&mut self, token: &str) -> Result<(), Error> { if self.could_consume(token) { self.offset += token.len(); self.ws(); @@ -66,11 +66,13 @@ impl<'a, 'c> Parser<'a, 'c> { } else { if self.offset == self.query.len() { Err(Error::Parse(format!("Expected '{}' at character {} but query string ended.", - token, self.offset))) + token, + self.offset))) } else { Err(Error::Parse(format!("Expected '{}' at character {}, found {}.", - token, self.offset, - &self.query[self.offset..self.offset+1]))) + token, + self.offset, + &self.query[self.offset..self.offset + 1]))) } } } @@ -94,7 +96,7 @@ impl<'a, 'c> Parser<'a, 'c> { { let mut chars = self.query[self.offset..].chars(); if let Some(c) = chars.next() { - // first char cannot be numeric + // first char cannot be numeric if c.is_alphabetic() || '_' == c || '$' == c { result.push(c); for c in chars { @@ -106,7 +108,7 @@ impl<'a, 'c> Parser<'a, 'c> { } } } - } + } if result.len() > 0 { self.offset += result.len(); self.ws(); @@ -146,11 +148,10 @@ impl<'a, 'c> Parser<'a, 'c> { Ok(None) } } - - fn consume_aggregate(&mut self) -> Result, - ReturnPath, - JsonValue)>, Error> { + + fn consume_aggregate + (&mut self) + -> Result, ReturnPath, JsonValue)>, Error> { let offset = self.offset; let mut aggregate_fun = if self.consume("group") { AggregateFun::GroupAsc @@ -175,7 +176,7 @@ impl<'a, 'c> Parser<'a, 'c> { } else 
if self.consume("count") { AggregateFun::Count } else { - return Ok(None) + return Ok(None); }; if self.consume("(") { @@ -232,7 +233,7 @@ impl<'a, 'c> Parser<'a, 'c> { try!(self.must_consume("]")); key } else { - if let Some(key) = self.consume_field() { + if let Some(key) = self.consume_field() { key } else { self.ws(); @@ -285,8 +286,9 @@ impl<'a, 'c> Parser<'a, 'c> { } } - fn consume_boost_and_wrap_filter(&mut self, filter: Box) - -> Result, Error> { + fn consume_boost_and_wrap_filter(&mut self, + filter: Box) + -> Result, Error> { let boost = try!(self.consume_boost()); if boost != 1.0 { Ok(Box::new(BoostFilter::new(filter, boost))) @@ -317,7 +319,11 @@ impl<'a, 'c> Parser<'a, 'c> { // parse the sign c = if c == '-' { result.push('-'); - if let Some(c) = chars.next() { c } else {return Ok(None); } + if let Some(c) = chars.next() { + c + } else { + return Ok(None); + } } else { c }; @@ -327,10 +333,18 @@ impl<'a, 'c> Parser<'a, 'c> { c = if c == '0' { result.push('0'); leading_zero = true; - if let Some(c) = chars.next() { c } else {return Ok(None); } + if let Some(c) = chars.next() { + c + } else { + return Ok(None); + } } else if c >= '1' && c <= '9' { result.push(c); - if let Some(c) = chars.next() { c } else {return Ok(None); } + if let Some(c) = chars.next() { + c + } else { + return Ok(None); + } } else if result.is_empty() { // no sign or digits found. not a number return Ok(None); @@ -398,7 +412,7 @@ impl<'a, 'c> Parser<'a, 'c> { } else { break 'outer; }; - + // parse exponent sign c = if c == '+' || c == '-' { result.push(c); @@ -454,62 +468,65 @@ impl<'a, 'c> Parser<'a, 'c> { // inside the string literal self.offset += 1; { - let mut chars = self.query[self.offset..].chars(); - 'outer: loop { - let char = if let Some(char) = chars.next() { - char - } else { - break; - }; - if char == '\\' { - self.offset += 1; - + let mut chars = self.query[self.offset..].chars(); + 'outer: loop { let char = if let Some(char) = chars.next() { char } else { break; }; - match char { - '\\' | '"' | '/' => lit.push(char), - 'n' => lit.push('\n'), - 'b' => lit.push('\x08'), - 'r' => lit.push('\r'), - 'f' => lit.push('\x0C'), - 't' => lit.push('\t'), - 'v' => lit.push('\x0B'), - 'u' => { - let mut n = 0; - for _i in 0..4 { - let char = if let Some(char) = chars.next() { - char - } else { - break 'outer; - }; - n = match char { - c @ '0' ... '9' => n * 16 + ((c as u16) - ('0' as u16)), - c @ 'a' ... 'f' => n * 16 + (10 + (c as u16) - ('a' as u16)), - c @ 'A' ... 
'F' => n * 16 + (10 + (c as u16) - ('A' as u16)), - _ => return Err(Error::Parse(format!( - "Invalid hexidecimal escape: {}", char))), - }; - + if char == '\\' { + self.offset += 1; + + let char = if let Some(char) = chars.next() { + char + } else { + break; + }; + match char { + '\\' | '"' | '/' => lit.push(char), + 'n' => lit.push('\n'), + 'b' => lit.push('\x08'), + 'r' => lit.push('\r'), + 'f' => lit.push('\x0C'), + 't' => lit.push('\t'), + 'v' => lit.push('\x0B'), + 'u' => { + let mut n = 0; + for _i in 0..4 { + let char = if let Some(char) = chars.next() { + char + } else { + break 'outer; + }; + n = match char { + c @ '0'...'9' => n * 16 + ((c as u16) - ('0' as u16)), + c @ 'a'...'f' => n * 16 + (10 + (c as u16) - ('a' as u16)), + c @ 'A'...'F' => n * 16 + (10 + (c as u16) - ('A' as u16)), + _ => { + let msg = format!("Invalid hexidecimal escape: {}", char); + return Err(Error::Parse(msg)); + } + }; + + } + self.offset += 3; // 3 because 1 is always added after the match below } - self.offset += 3; // 3 because 1 is always added after the match below - }, - _ => return Err(Error::Parse(format!("Unknown character escape: {}", - char))), - }; - self.offset += 1; - } else { - if char == '"' { - break; + _ => { + return Err(Error::Parse(format!("Unknown character escape: {}", char))) + } + }; + self.offset += 1; } else { - lit.push(char); - self.offset += char.len_utf8(); + if char == '"' { + break; + } else { + lit.push(char); + self.offset += char.len_utf8(); + } } } } - } try!(self.must_consume("\"")); Ok(Some(lit)) } @@ -524,7 +541,7 @@ impl<'a, 'c> Parser<'a, 'c> { } else { Ok(RangeOperator::Exclusive(num)) } - }, + } _ => panic!("Range operator on other JSON types is not yet implemented!"), } } @@ -548,9 +565,9 @@ impl<'a, 'c> Parser<'a, 'c> { if self.consume("{") { let mut left = try!(self.obool()); try!(self.must_consume("}")); - + left = try!(self.consume_boost_and_wrap_filter(left)); - + if self.consume("&&") { let right = try!(self.not_object()); Ok(Box::new(AndFilter::new(vec![left, right], self.kb.arraypath_len()))) @@ -562,7 +579,7 @@ impl<'a, 'c> Parser<'a, 'c> { Ok(left) } } else { - self.parens() + self.parens() } } @@ -573,7 +590,7 @@ impl<'a, 'c> Parser<'a, 'c> { try!(self.must_consume("(")); let filter = try!(self.object()); try!(self.must_consume(")")); - + self.consume_boost_and_wrap_filter(filter) } @@ -665,32 +682,36 @@ impl<'a, 'c> Parser<'a, 'c> { } } let filter = StemmedPhraseFilter::new(filters); - Box::new(ExactMatchFilter::new(&self.snapshot, filter, self.kb.clone(), literal, true)) - }, + Box::new(ExactMatchFilter::new(&self.snapshot, + filter, + self.kb.clone(), + literal, + true)) + } JsonValue::Number(num) => { Box::new(RangeFilter::new(&self.snapshot, self.kb.clone(), Some(RangeOperator::Inclusive(num)), Some(RangeOperator::Inclusive(num)))) - }, + } JsonValue::True => { Box::new(RangeFilter::new(&self.snapshot, self.kb.clone(), Some(RangeOperator::True), Some(RangeOperator::True))) - }, + } JsonValue::False => { Box::new(RangeFilter::new(&self.snapshot, self.kb.clone(), Some(RangeOperator::False), Some(RangeOperator::False))) - }, + } JsonValue::Null => { Box::new(RangeFilter::new(&self.snapshot, self.kb.clone(), Some(RangeOperator::Null), Some(RangeOperator::Null))) - }, + } _ => panic!("Exact match on other JSON types is not yet implemented!"), }; Ok(filter) @@ -705,24 +726,28 @@ impl<'a, 'c> Parser<'a, 'c> { 0 => panic!("Cannot create a StemmedWordFilter"), 1 => { Ok(Box::new(StemmedWordFilter::new(&self.snapshot, - &stemmed_words[0], &self.kb, 
boost))) - }, + &stemmed_words[0], + &self.kb, + boost))) + } _ => { let mut filters: Vec = Vec::new(); for stemmed_word in stemmed_words { let filter = StemmedWordPosFilter::new(&self.snapshot, - &stemmed_word, &self.kb, boost); + &stemmed_word, + &self.kb, + boost); filters.push(filter); } Ok(Box::new(StemmedPhraseFilter::new(filters))) - }, + } } } else if self.consume("~") { let word_distance = match try!(self.consume_integer()) { Some(int) => int, None => { return Err(Error::Parse("Expected integer for proximity search".to_string())); - }, + } }; try!(self.must_consume("=")); @@ -731,8 +756,8 @@ impl<'a, 'c> Parser<'a, 'c> { let stems = Stems::new(&literal); let mut filters: Vec = Vec::new(); for stem in stems { - let filter = StemmedWordPosFilter::new(&self.snapshot, - &stem.stemmed, &self.kb, boost); + let filter = + StemmedWordPosFilter::new(&self.snapshot, &stem.stemmed, &self.kb, boost); filters.push(filter); } if word_distance > std::u32::MAX as i64 { @@ -875,8 +900,13 @@ impl<'a, 'c> Parser<'a, 'c> { sort }; - sort_infos.insert(rp.to_key(), SortInfo{field: SortField::FetchValue(rp), - sort: sort, order_to_apply: n, default: default}); + sort_infos.insert(rp.to_key(), + SortInfo { + field: SortField::FetchValue(rp), + sort: sort, + order_to_apply: n, + default: default, + }); } else { try!(self.must_consume("score")); try!(self.must_consume("(")); @@ -893,8 +923,12 @@ impl<'a, 'c> Parser<'a, 'c> { }; sort_infos.insert("score()".to_string(), - SortInfo{field: SortField::Score, order_to_apply: n, - sort: sort, default: JsonValue::Null}); + SortInfo { + field: SortField::Score, + order_to_apply: n, + sort: sort, + default: JsonValue::Null, + }); } if !self.consume(",") { @@ -919,7 +953,12 @@ impl<'a, 'c> Parser<'a, 'c> { } else { let mut rp = ReturnPath::new(); rp.push_object_key("_id".to_string()); - Ok(Box::new(RetValue{rp: rp, ag:None, default: JsonValue::Null, sort_info: None})) + Ok(Box::new(RetValue { + rp: rp, + ag: None, + default: JsonValue::Null, + sort_info: None, + })) } } @@ -941,9 +980,9 @@ impl<'a, 'c> Parser<'a, 'c> { break; } } - + try!(self.must_consume("}")); - Ok(Box::new(RetObject{fields: fields})) + Ok(Box::new(RetObject { fields: fields })) } fn ret_array(&mut self) -> Result, Error> { @@ -960,24 +999,24 @@ impl<'a, 'c> Parser<'a, 'c> { } } try!(self.must_consume("]")); - Ok(Box::new(RetArray{slots: slots})) + Ok(Box::new(RetArray { slots: slots })) } fn ret_value(&mut self) -> Result>, Error> { if self.consume("true") { - return Ok(Some(Box::new(RetLiteral{json: JsonValue::True}))); + return Ok(Some(Box::new(RetLiteral { json: JsonValue::True }))); } else if self.consume("false") { - return Ok(Some(Box::new(RetLiteral{json: JsonValue::False}))); + return Ok(Some(Box::new(RetLiteral { json: JsonValue::False }))); } else if self.consume("null") { - return Ok(Some(Box::new(RetLiteral{json: JsonValue::Null}))); + return Ok(Some(Box::new(RetLiteral { json: JsonValue::Null }))); } else if self.could_consume("score") { let offset = self.offset; let _ = self.consume("score"); if self.consume("(") { try!(self.must_consume(")")); self.needs_scoring = true; - return Ok(Some(Box::new(RetScore{sort_info: None}))); + return Ok(Some(Box::new(RetScore { sort_info: None }))); } else { //wasn't the score, maybe it's a bind variable self.offset = offset; @@ -991,11 +1030,20 @@ impl<'a, 'c> Parser<'a, 'c> { JsonValue::Null }; if let Some(bind_name) = bind_name_option { - Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_rp: rp, - ag: Some((ag, json)), default: default, 
sort_info:None}))) + Ok(Some(Box::new(RetBind { + bind_name: bind_name, + extra_rp: rp, + ag: Some((ag, json)), + default: default, + sort_info: None, + }))) } else { - Ok(Some(Box::new(RetValue{rp: rp, ag: Some((ag, json)), - default: default, sort_info:None}))) + Ok(Some(Box::new(RetValue { + rp: rp, + ag: Some((ag, json)), + default: default, + sort_info: None, + }))) } } else if let Some(bind_name) = self.consume_field() { let rp = if let Some(rp) = try!(self.consume_keypath()) { @@ -1010,24 +1058,34 @@ impl<'a, 'c> Parser<'a, 'c> { JsonValue::Null }; - Ok(Some(Box::new(RetBind{bind_name: bind_name, extra_rp: rp, - ag: None, default: default, sort_info:None}))) + Ok(Some(Box::new(RetBind { + bind_name: bind_name, + extra_rp: rp, + ag: None, + default: default, + sort_info: None, + }))) } else if let Some(rp) = try!(self.consume_keypath()) { let default = if let Some(default) = try!(self.consume_default()) { default } else { JsonValue::Null }; - - Ok(Some(Box::new(RetValue{rp: rp, ag: None, default: default, sort_info: None}))) + + Ok(Some(Box::new(RetValue { + rp: rp, + ag: None, + default: default, + sort_info: None, + }))) } else if self.could_consume("{") { Ok(Some(try!(self.ret_object()))) } else if self.could_consume("[") { Ok(Some(try!(self.ret_array()))) } else if let Some(string) = try!(self.consume_string_literal()) { - Ok(Some(Box::new(RetLiteral{json: JsonValue::String(string)}))) + Ok(Some(Box::new(RetLiteral { json: JsonValue::String(string) }))) } else if let Some(num) = try!(self.consume_number()) { - Ok(Some(Box::new(RetLiteral{json: JsonValue::Number(num)}))) + Ok(Some(Box::new(RetLiteral { json: JsonValue::Number(num) }))) } else { Ok(None) } @@ -1037,13 +1095,11 @@ impl<'a, 'c> Parser<'a, 'c> { if self.consume("limit") { if let Some(i) = try!(self.consume_integer()) { if i <= 0 { - return Err(Error::Parse("limit must be an integer greater than 0" - .to_string())); + return Err(Error::Parse("limit must be an integer greater than 0".to_string())); } Ok(i as usize) } else { - return Err(Error::Parse("limit expects an integer greater than 0" - .to_string())); + return Err(Error::Parse("limit expects an integer greater than 0".to_string())); } } else { Ok(usize::MAX) @@ -1157,7 +1213,7 @@ mod tests { use super::Parser; use index::{Index, OpenOptions}; - + #[test] fn test_whitespace() { let dbname = "target/tests/test_whitespace"; @@ -1190,7 +1246,8 @@ mod tests { let query = r#"" \n \t test""#.to_string(); let mut parser = Parser::new(&query, snapshot); - assert_eq!(parser.must_consume_string_literal().unwrap(), " \n \t test".to_string()); + assert_eq!(parser.must_consume_string_literal().unwrap(), + " \n \t test".to_string()); } #[test] diff --git a/src/query.rs b/src/query.rs index 8331711..103a97a 100644 --- a/src/query.rs +++ b/src/query.rs @@ -10,7 +10,7 @@ use std::usize; use error::Error; use index::Index; use parser::Parser; -use json_value::{JsonValue}; +use json_value::JsonValue; use filters::QueryRuntimeFilter; use aggregates::AggregateFun; use returnable::{Returnable, RetValue, RetScore, RetHidden, ReturnPath}; @@ -41,7 +41,8 @@ impl DocResult { result_keys.push(result_key); return; } - self.bind_name_result.insert(bind_name.to_string(), vec![result_key]); + self.bind_name_result + .insert(bind_name.to_string(), vec![result_key]); } pub fn combine(&mut self, other: &mut DocResult) { @@ -52,7 +53,8 @@ impl DocResult { result_keys.append(&mut result_keys_other); continue; } - self.bind_name_result.insert(bind_name, result_keys_other); + self.bind_name_result + 
.insert(bind_name, result_keys_other); } self.scores.append(&mut other.scores); } @@ -162,7 +164,7 @@ impl Query { if index.rocks.is_none() { return Err(Error::Parse("You must open the index first".to_string())); } - + let snapshot = index.new_snapshot(); let mut parser = Parser::new(query, snapshot); let mut filter = try!(parser.build_filter()); @@ -173,11 +175,12 @@ impl Query { try!(filter.check_double_not(false)); if filter.is_all_not() { - return Err(Error::Parse("query cannot be made up of only logical not. Must have at least \ - one match clause not negated.".to_string())); + return Err(Error::Parse("query cannot be made up of only logical not. Must have at \ + least one match clause not negated." + .to_string())); } - - let mut ags = Vec::new(); + + let mut ags = Vec::new(); returnable.get_aggregate_funs(&mut ags); let mut has_ags = false; @@ -191,7 +194,7 @@ impl Query { returnable = if has_sorting && has_ags { return Err(Error::Parse("Cannot have aggregates and sorting in the same query" - .to_string())); + .to_string())); } else if has_sorting { returnable.take_sort_for_matching_fields(&mut sorts); if !sorts.is_empty() { @@ -200,17 +203,22 @@ impl Query { let sort = sort_info.clone(); match sort_info.field { SortField::FetchValue(rp) => { - vec.push(Box::new(RetValue{ rp: rp, - ag: None, - default: sort_info.default, - sort_info: Some(sort)})); - }, + vec.push(Box::new(RetValue { + rp: rp, + ag: None, + default: sort_info.default, + sort_info: Some(sort), + })); + } SortField::Score => { - vec.push(Box::new(RetScore{ sort_info: Some(sort)})); - }, + vec.push(Box::new(RetScore { sort_info: Some(sort) })); + } } } - Box::new(RetHidden{unrendered: vec, visible: returnable}) + Box::new(RetHidden { + unrendered: vec, + visible: returnable, + }) } else { returnable } @@ -223,7 +231,8 @@ impl Query { for option_ag in ags.iter() { if option_ag.is_none() { return Err(Error::Parse("Return keypaths must either all have \ - aggregate functions, or none can them.".to_string())); + aggregate functions, or none can have them." 
+ .to_string())); } } } @@ -249,12 +258,15 @@ impl Query { } // order we process sorts is important sorts.sort_by_key(|&(ref sort_info, ref _n)| sort_info.order_to_apply); - sorts.into_iter().map(|(sort_info, n)| (sort_info.sort, n)).collect() + sorts + .into_iter() + .map(|(sort_info, n)| (sort_info.sort, n)) + .collect() } else { Vec::new() }; - - + + let mut does_group_or_aggr = false; let mut aggr_inits = Vec::new(); let mut aggr_actions = Vec::new(); @@ -266,7 +278,7 @@ impl Query { n -= 1; if ag == AggregateFun::GroupAsc { sorts.push((Sort::Asc, n)); - } else if ag == AggregateFun::GroupDesc { + } else if ag == AggregateFun::GroupDesc { sorts.push((Sort::Desc, n)); } else { let ag_impls = ag.get_fun_impls(); @@ -282,38 +294,41 @@ impl Query { // the order we process groups in important sorts.reverse(); } - - let mut qsi = QueryScoringInfo{num_terms: 0, sum_of_idt_sqs: 0.0}; - + + let mut qsi = QueryScoringInfo { + num_terms: 0, + sum_of_idt_sqs: 0.0, + }; + if parser.needs_scoring { filter.prepare_relevancy_scoring(&mut qsi); } let query_norm = if qsi.num_terms > 0 { - 1.0/(qsi.sum_of_idt_sqs as f32) + 1.0 / (qsi.sum_of_idt_sqs as f32) } else { 0.0 }; Ok(QueryResults { - filter: filter, - doc_result_next: DocResult::new(), - fetcher: parser.snapshot.new_json_fetcher(), - snapshot: parser.snapshot, - returnable: returnable, - needs_sorting_and_ags: needs_sorting_and_ags, - done_with_sorting_and_ags: false, - does_group_or_aggr: does_group_or_aggr, - sorts: Some(sorts), - aggr_inits: aggr_inits, - aggr_actions: aggr_actions, - aggr_finals: aggr_finals, - in_buffer: Vec::new(), - sorted_buffer: Vec::new(), - limit: limit, - scoring_num_terms: qsi.num_terms, - scoring_query_norm: query_norm, - }) + filter: filter, + doc_result_next: DocResult::new(), + fetcher: parser.snapshot.new_json_fetcher(), + snapshot: parser.snapshot, + returnable: returnable, + needs_sorting_and_ags: needs_sorting_and_ags, + done_with_sorting_and_ags: false, + does_group_or_aggr: does_group_or_aggr, + sorts: Some(sorts), + aggr_inits: aggr_inits, + aggr_actions: aggr_actions, + aggr_finals: aggr_finals, + in_buffer: Vec::new(), + sorted_buffer: Vec::new(), + limit: limit, + scoring_num_terms: qsi.num_terms, + scoring_query_norm: query_norm, + }) } } @@ -328,9 +343,9 @@ pub struct QueryResults<'a> { done_with_sorting_and_ags: bool, does_group_or_aggr: bool, sorts: Option>, - aggr_inits: Vec<(fn (JsonValue) -> JsonValue, usize)>, - aggr_actions: Vec<(fn (&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, - aggr_finals: Vec<(fn (&mut JsonValue), usize)>, + aggr_inits: Vec<(fn(JsonValue) -> JsonValue, usize)>, + aggr_actions: Vec<(fn(&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, + aggr_finals: Vec<(fn(&mut JsonValue), usize)>, in_buffer: Vec>, sorted_buffer: Vec>, limit: usize, @@ -339,21 +354,20 @@ pub struct QueryResults<'a> { } impl<'a> QueryResults<'a> { - - fn compute_relevancy_score(& self, dr: &DocResult) -> f32 { + fn compute_relevancy_score(&self, dr: &DocResult) -> f32 { if self.scoring_num_terms == 0 { - return 0.0 + return 0.0; } let mut num_terms_matched = 0; let mut score: f32 = 0.0; for &(ref total_term_score, ref num_times_term_matched) in dr.scores.iter() { if *num_times_term_matched > 0 { - score += *total_term_score/(*num_times_term_matched as f32); + score += *total_term_score / (*num_times_term_matched as f32); num_terms_matched += 1; } } - self.scoring_query_norm * score * (num_terms_matched as f32) - / (self.scoring_num_terms as f32) + self.scoring_query_norm * score * 
(num_terms_matched as f32) / + (self.scoring_num_terms as f32) } fn get_next_result(&mut self) -> Option { @@ -365,7 +379,7 @@ impl<'a> QueryResults<'a> { Some(doc_result) => { self.doc_result_next.seq = doc_result.seq + 1; Some(doc_result) - }, + } None => None, } } @@ -386,9 +400,9 @@ impl<'a> QueryResults<'a> { match self.snapshot.get(&key.as_bytes()) { // If there is an id, it's UTF-8. Strip off type leading byte Some(id) => Some(id.to_utf8().unwrap()[1..].to_string()), - None => None + None => None, } - }, + } None => None, } } @@ -405,13 +419,17 @@ impl<'a> QueryResults<'a> { Some(dr) => { let score = self.compute_relevancy_score(&dr); let mut results = VecDeque::new(); - self.returnable.fetch_result(&mut self.fetcher, dr.seq, score, - &dr.bind_name_result, &mut results); + self.returnable + .fetch_result(&mut self.fetcher, + dr.seq, + score, + &dr.bind_name_result, + &mut results); self.in_buffer.push(results); if self.in_buffer.len() == self.limit { self.do_sorting_and_ags(); } - }, + } None => { if !self.done_with_sorting_and_ags { self.do_sorting_and_ags(); @@ -430,7 +448,7 @@ impl<'a> QueryResults<'a> { } else { return None; } - }, + } } } } else { @@ -440,14 +458,20 @@ impl<'a> QueryResults<'a> { }; let score = self.compute_relevancy_score(&dr); let mut results = VecDeque::new(); - self.returnable.fetch_result(&mut self.fetcher, dr.seq, score, - &dr.bind_name_result, &mut results); + self.returnable + .fetch_result(&mut self.fetcher, + dr.seq, + score, + &dr.bind_name_result, + &mut results); Some(self.returnable.json_result(&mut results)) } } fn cmp_results(sorts: &Vec<(Sort, usize)>, - a: &VecDeque, b: &VecDeque) -> Ordering { + a: &VecDeque, + b: &VecDeque) + -> Ordering { for &(ref sort_dir, n) in sorts.iter() { let cmp = if *sort_dir != Sort::Desc { b[n].cmp(&a[n]) @@ -467,7 +491,8 @@ impl<'a> QueryResults<'a> { // we need to put it back before returning. 
let sorts = self.sorts.take().unwrap(); if !sorts.is_empty() { - self.in_buffer.sort_by(|a, b| QueryResults::cmp_results(&sorts, &a, &b)); + self.in_buffer + .sort_by(|a, b| QueryResults::cmp_results(&sorts, &a, &b)); } // put back self.sorts = Some(sorts); @@ -491,13 +516,13 @@ impl<'a> QueryResults<'a> { new_buffer.push(b); option_a = Some(a); option_b = self.in_buffer.pop(); - }, + } Ordering::Greater => { new_buffer.push(a); option_a = self.sorted_buffer.pop(); option_b = Some(b); - }, + } Ordering::Equal => { new_buffer.push(a); new_buffer.push(b); @@ -511,7 +536,7 @@ impl<'a> QueryResults<'a> { new_buffer.truncate(self.limit); break; } - }, + } (Some(a), None) => { new_buffer.push(a); if new_buffer.len() == self.limit { @@ -524,7 +549,7 @@ impl<'a> QueryResults<'a> { } } break; - }, + } (None, Some(b)) => { new_buffer.push(b); if new_buffer.len() == self.limit { @@ -537,9 +562,9 @@ impl<'a> QueryResults<'a> { } } break; - }, + } (None, None) => break, - } + } } // put back self.sorts = Some(sorts); @@ -550,10 +575,9 @@ impl<'a> QueryResults<'a> { return; } - + //merge the sorted buffers - let mut new_buffer = Vec::with_capacity(self.sorted_buffer.len() + - self.in_buffer.len()); + let mut new_buffer = Vec::with_capacity(self.sorted_buffer.len() + self.in_buffer.len()); let mut option_old = self.sorted_buffer.pop(); let mut option_new = self.in_buffer.pop(); // take out for borrow check @@ -576,12 +600,12 @@ impl<'a> QueryResults<'a> { self.sorted_buffer.push(old); option_old = Some(new); option_new = self.in_buffer.pop(); - }, + } Ordering::Greater => { new_buffer.push(old); option_old = self.sorted_buffer.pop(); option_new = Some(new); - }, + } Ordering::Equal => { for &(ref action, ref user_arg, n) in self.aggr_actions.iter() { // we can't swap out a value of new directly, so this lets us @@ -600,7 +624,7 @@ impl<'a> QueryResults<'a> { self.in_buffer.clear(); break; } - }, + } (Some(old), None) => { new_buffer.push(old); if new_buffer.len() == self.limit { @@ -613,7 +637,7 @@ impl<'a> QueryResults<'a> { } } break; - }, + } (None, Some(mut new)) => { for &(ref init, n) in self.aggr_inits.iter() { // we can't swap out a value of new directly, so this lets us @@ -625,9 +649,9 @@ impl<'a> QueryResults<'a> { } option_old = Some(new); option_new = self.in_buffer.pop(); - }, + } (None, None) => break, - } + } } // put back self.sorts = Some(sorts); @@ -688,7 +712,6 @@ mod tests { index.flush(batch).unwrap(); let mut query_results = Query::get_matches(r#"find {hello:=="world"}"#, &index).unwrap(); - //let mut query_results = Query::get_matches(r#"a.b[foo="bar"]"#.to_string(), &index).unwrap(); println!("query results: {:?}", query_results.get_next_id()); } @@ -702,7 +725,8 @@ mod tests { let mut batch = Batch::new(); for ii in 1..100 { let data = ((ii % 25) + 97) as u8 as char; - let _ = index.add(&format!(r#"{{"_id":"{}", "data": "{}"}}"#, ii, data), &mut batch); + let _ = index.add(&format!(r#"{{"_id":"{}", "data": "{}"}}"#, ii, data), + &mut batch); } index.flush(batch).unwrap(); diff --git a/src/repl.rs b/src/repl.rs index 51d3aa9..07fb1a4 100644 --- a/src/repl.rs +++ b/src/repl.rs @@ -7,8 +7,8 @@ use std::mem; fn is_command(str: &str) -> bool { - let commands = ["find", "add", "create", "drop", "open", - "pretty", "commit", "del", "load", "dumpkeys"]; + let commands = ["find", "add", "create", "drop", "open", "pretty", "commit", "del", "load", + "dumpkeys"]; for command in commands.iter() { if str.starts_with(command) { return true; @@ -102,10 +102,10 @@ pub fn repl(r: &mut BufRead, 
w: &mut Write, test_mode: bool) { for key in keys { write!(w, "{}\n", key).unwrap(); } - }, + } Err(reason) => { write!(w, "{}\n", reason).unwrap(); - }, + } } } else if lines.starts_with("add") { match index.add(&lines[3..], &mut batch) { @@ -147,7 +147,7 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { w.write_all(b"\n").unwrap(); } w.write_all(b"]\n").unwrap(); - }, + } Err(reason) => write!(w, "{}\n", reason).unwrap(), } } @@ -155,5 +155,3 @@ pub fn repl(r: &mut BufRead, w: &mut Write, test_mode: bool) { lines.clear(); } } - - diff --git a/src/returnable.rs b/src/returnable.rs index 528f9b3..dac4253 100644 --- a/src/returnable.rs +++ b/src/returnable.rs @@ -23,7 +23,7 @@ pub struct ReturnPath { impl ReturnPath { pub fn new() -> ReturnPath { - ReturnPath{path: Vec::new()} + ReturnPath { path: Vec::new() } } pub fn push_object_key(&mut self, key: String) { @@ -51,14 +51,14 @@ impl ReturnPath { } key.push(cc); } - }, + } &PathSegment::Array(ref i) => { key.push('$'); key.push_str(&i.to_string()); - }, + } &PathSegment::ArrayAll => { key.push_str("$*"); - }, + } } } key @@ -82,7 +82,10 @@ pub trait Returnable { /// and then each nested Returnable will fetch information about the document (fields or /// scores or bind variables etc) and convert them to JsonValues and add them to the result /// VecDeque. - fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque); @@ -114,7 +117,10 @@ pub struct RetObject { } impl Returnable for RetObject { - fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { for &(ref _key, ref field) in self.fields.iter() { @@ -127,17 +133,17 @@ impl Returnable for RetObject { field.get_aggregate_funs(funs); } } - + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - for &mut (ref _key, ref mut field) in self.fields.iter_mut() { + for &mut (ref _key, ref mut field) in self.fields.iter_mut() { field.take_sort_for_matching_fields(map); - } + } } fn get_sorting(&mut self, sorts: &mut Vec>) { - for &mut (ref mut _key, ref mut field) in self.fields.iter_mut() { + for &mut (ref mut _key, ref mut field) in self.fields.iter_mut() { field.get_sorting(sorts); - } + } } fn json_result(&self, results: &mut VecDeque) -> JsonValue { @@ -155,7 +161,10 @@ pub struct RetArray { } impl Returnable for RetArray { - fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { for ref slot in self.slots.iter() { @@ -164,21 +173,21 @@ impl Returnable for RetArray { } fn get_aggregate_funs(&self, funs: &mut Vec>) { - for ref slot in self.slots.iter() { + for ref slot in self.slots.iter() { slot.get_aggregate_funs(funs); } } - + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - for slot in self.slots.iter_mut() { + for slot in self.slots.iter_mut() { slot.take_sort_for_matching_fields(map); - } + } } fn get_sorting(&mut self, sorts: &mut Vec>) { - for ref mut slot in self.slots.iter_mut() { + for ref mut slot in self.slots.iter_mut() { slot.get_sorting(sorts); - } + } } fn json_result(&self, results: &mut VecDeque) -> JsonValue { @@ -198,29 +207,33 @@ pub struct RetHidden { } impl Returnable for RetHidden { - fn 
fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, score: f32, + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { for ref unrendered in self.unrendered.iter() { unrendered.fetch_result(fetcher, seq, score, bind_var_keys, result); } - self.visible.fetch_result(fetcher, seq, score, bind_var_keys, result); + self.visible + .fetch_result(fetcher, seq, score, bind_var_keys, result); } fn get_aggregate_funs(&self, funs: &mut Vec>) { self.visible.get_aggregate_funs(funs); } - + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { self.visible.take_sort_for_matching_fields(map); } fn get_sorting(&mut self, sorts: &mut Vec>) { - for ref mut unrendered in self.unrendered.iter_mut() { + for ref mut unrendered in self.unrendered.iter_mut() { unrendered.get_sorting(sorts); } - + self.visible.get_sorting(sorts); } @@ -240,7 +253,10 @@ pub struct RetLiteral { } impl Returnable for RetLiteral { - fn fetch_result(&self, _fetcher: &mut JsonFetcher, _seq: u64, _score: f32, + fn fetch_result(&self, + _fetcher: &mut JsonFetcher, + _seq: u64, + _score: f32, _bind_var_keys: &HashMap>, _result: &mut VecDeque) { } @@ -248,7 +264,7 @@ impl Returnable for RetLiteral { fn get_aggregate_funs(&self, _funs: &mut Vec>) { //noop } - + fn take_sort_for_matching_fields(&mut self, _map: &mut HashMap) { //noop } @@ -274,7 +290,10 @@ pub struct RetValue { impl Returnable for RetValue { - fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, _score: f32, + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + _score: f32, _bind_var_keys: &HashMap>, result: &mut VecDeque) { if Some((AggregateFun::Count, JsonValue::Null)) == self.ag { @@ -292,8 +311,8 @@ impl Returnable for RetValue { fn get_aggregate_funs(&self, funs: &mut Vec>) { funs.push(self.ag.clone()); } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { self.sort_info = map.remove(&self.rp.to_key()); } @@ -322,7 +341,10 @@ pub struct RetBind { } impl Returnable for RetBind { - fn fetch_result(&self, fetcher: &mut JsonFetcher, seq: u64, _score: f32, + fn fetch_result(&self, + fetcher: &mut JsonFetcher, + seq: u64, + _score: f32, bind_var_keys: &HashMap>, result: &mut VecDeque) { @@ -347,8 +369,8 @@ impl Returnable for RetBind { fn get_aggregate_funs(&self, funs: &mut Vec>) { funs.push(self.ag.clone()); } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { self.sort_info = map.remove(&(self.bind_name.to_string() + &self.extra_rp.to_key())); } @@ -371,7 +393,10 @@ pub struct RetScore { } impl Returnable for RetScore { - fn fetch_result(&self, _fetcher: &mut JsonFetcher, _seq: u64, score: f32, + fn fetch_result(&self, + _fetcher: &mut JsonFetcher, + _seq: u64, + score: f32, _bind_var_keys: &HashMap>, result: &mut VecDeque) { result.push_back(JsonValue::Number(score as f64)); @@ -380,8 +405,8 @@ impl Returnable for RetScore { fn get_aggregate_funs(&self, _funs: &mut Vec>) { // noop } - - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + + fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { self.sort_info = map.remove("score()"); } diff --git a/src/snapshot.rs b/src/snapshot.rs index 56ccf22..75c0719 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -22,7 +22,7 @@ pub struct Snapshot<'a> { impl<'a> Snapshot<'a> { pub fn new(rocks: RocksSnapshot) -> Snapshot { - 
Snapshot{rocks: rocks} + Snapshot { rocks: rocks } } pub fn new_term_doc_result_iterator(&self, term: &str, kb: &KeyBuilder) -> DocResultIterator { @@ -49,9 +49,7 @@ impl<'a> Snapshot<'a> { } pub fn new_json_fetcher(&self) -> JsonFetcher { - JsonFetcher { - iter: self.rocks.iterator(IteratorMode::Start), - } + JsonFetcher { iter: self.rocks.iterator(IteratorMode::Start) } } pub fn new_iterator(&self) -> DBIterator { @@ -65,12 +63,11 @@ pub struct DocResultIterator { } impl DocResultIterator { - pub fn advance_gte(&mut self, start: &DocResult) { KeyBuilder::add_doc_result_to_keypathword(&mut self.keypathword, &start); // Seek in index to >= entry - self.iter.set_mode(IteratorMode::From(self.keypathword.as_bytes(), - rocksdb::Direction::Forward)); + self.iter + .set_mode(IteratorMode::From(self.keypathword.as_bytes(), rocksdb::Direction::Forward)); KeyBuilder::truncate_to_keypathword(&mut self.keypathword); } @@ -78,13 +75,13 @@ impl DocResultIterator { if let Some((key, value)) = self.iter.next() { if !key.starts_with(self.keypathword.as_bytes()) { // we passed the key path we are interested in. nothing left to do */ - return None + return None; } - let key_str = unsafe{str::from_utf8_unchecked(&key)}; + let key_str = unsafe { str::from_utf8_unchecked(&key) }; let dr = KeyBuilder::parse_doc_result_from_key(&key_str); - Some((dr, TermPositions{pos: value.into_vec()})) + Some((dr, TermPositions { pos: value.into_vec() })) } else { None } @@ -117,7 +114,6 @@ pub struct Scorer { } impl Scorer { - pub fn init(&mut self, qsi: &mut QueryScoringInfo) { let key = self.kb.keypathword_count_key(&self.term); let doc_freq = if let Some(bytes) = self.get_value(&key) { @@ -133,16 +129,17 @@ impl Scorer { 0.0 }; - self.idf = 1.0 + (num_docs/(doc_freq + 1.0)).ln(); + self.idf = 1.0 + (num_docs / (doc_freq + 1.0)).ln(); self.term_ordinal = qsi.num_terms; qsi.num_terms += 1; qsi.sum_of_idt_sqs += self.idf * self.idf; } pub fn get_value(&mut self, key: &str) -> Option> { - self.iter.set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); + self.iter + .set_mode(IteratorMode::From(key.as_bytes(), rocksdb::Direction::Forward)); if let Some((ret_key, ret_value)) = self.iter.next() { - if ret_key.len() == key.len() && ret_key.starts_with(key.as_bytes()) { + if ret_key.len() == key.len() && ret_key.starts_with(key.as_bytes()) { Some(ret_value) } else { None @@ -162,7 +159,7 @@ impl Scorer { }; let tf: f32 = (num_matches as f32).sqrt(); - let norm = 1.0/(total_field_words as f32).sqrt(); + let norm = 1.0 / (total_field_words as f32).sqrt(); let score = self.idf * self.idf * tf * norm * self.boost; dr.add_score(self.term_ordinal, score); } @@ -179,26 +176,29 @@ pub struct JsonFetcher { } impl JsonFetcher { - - pub fn fetch(&mut self, seq: u64, mut kb_base: &mut KeyBuilder, rp: &ReturnPath) -> Option { + pub fn fetch(&mut self, + seq: u64, + mut kb_base: &mut KeyBuilder, + rp: &ReturnPath) + -> Option { JsonFetcher::descend_return_path(&mut self.iter, seq, &mut kb_base, &rp, 0) } pub fn bytes_to_json_value(bytes: &[u8]) -> JsonValue { match bytes[0] as char { 's' => { - let string = unsafe{str::from_utf8_unchecked(&bytes[1..])}.to_string(); + let string = unsafe { str::from_utf8_unchecked(&bytes[1..]) }.to_string(); JsonValue::String(string) - }, + } 'f' => { assert!(bytes.len() == 9); let mut bytes2: [u8; 8] = [0; 8]; for (n, b) in bytes[1..9].iter().enumerate() { - bytes2[n] = *b; + bytes2[n] = *b; } - let double: f64 = unsafe{transmute(bytes2)}; + let double: f64 = unsafe { transmute(bytes2) }; 
JsonValue::Number(double) - }, + } 'T' => JsonValue::True, 'F' => JsonValue::False, 'N' => JsonValue::Null, @@ -210,28 +210,33 @@ impl JsonFetcher { fn return_array(mut array: Vec<(u64, JsonValue)>) -> JsonValue { array.sort_by_key(|tuple| tuple.0); - JsonValue::Array(array.into_iter() - .map(|(_i, json)| json) - .collect()) + JsonValue::Array(array.into_iter().map(|(_i, json)| json).collect()) } - fn descend_return_path(iter: &mut DBIterator, seq: u64, kb: &mut KeyBuilder, - rp: &ReturnPath, mut rp_index: usize) -> Option { - + fn descend_return_path(iter: &mut DBIterator, + seq: u64, + kb: &mut KeyBuilder, + rp: &ReturnPath, + mut rp_index: usize) + -> Option { + while let Some(segment) = rp.nth(rp_index) { rp_index += 1; match segment { &PathSegment::ObjectKey(ref string) => { kb.push_object_key(string); - }, + } &PathSegment::ArrayAll => { let mut i = 0; let mut vec = Vec::new(); loop { kb.push_array_index(i); i += 1; - if let Some(json) = JsonFetcher::descend_return_path(iter, seq, - &mut kb.clone(), rp, rp_index) { + if let Some(json) = JsonFetcher::descend_return_path(iter, + seq, + &mut kb.clone(), + rp, + rp_index) { vec.push(json); kb.pop_array(); } else { @@ -243,15 +248,15 @@ impl JsonFetcher { // Seek in index to >= entry iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - + rocksdb::Direction::Forward)); + if let Some((key, _value)) = iter.next() { if key.starts_with(value_key.as_bytes()) { // yes it exists. loop again. - continue; + continue; } } - + if vec.is_empty() { return None; } else { @@ -259,7 +264,7 @@ impl JsonFetcher { } } } - }, + } &PathSegment::Array(ref index) => { kb.push_array_index(*index); } @@ -269,35 +274,35 @@ impl JsonFetcher { let value_key = kb.value_key(seq); // Seek in index to >= entry - iter.set_mode(IteratorMode::From(value_key.as_bytes(), - rocksdb::Direction::Forward)); - + iter.set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward)); + let (key, value) = match iter.next() { Some((key, value)) => (key, value), - None => { - return None - }, + None => return None, }; if !key.starts_with(value_key.as_bytes()) { - return None + return None; } Some(JsonFetcher::do_fetch(&mut iter.peekable(), &value_key, key, value)) } - fn do_fetch(iter: &mut Peekable<&mut DBIterator>, value_key: &str, - mut key: Box<[u8]>, mut value: Box<[u8]>) -> JsonValue { + fn do_fetch(iter: &mut Peekable<&mut DBIterator>, + value_key: &str, + mut key: Box<[u8]>, + mut value: Box<[u8]>) + -> JsonValue { if key.len() == value_key.len() { // we have a key match! 
return JsonFetcher::bytes_to_json_value(value.as_ref()); } let segment = { - let key_str = unsafe{str::from_utf8_unchecked(&key)}; + let key_str = unsafe { str::from_utf8_unchecked(&key) }; let remaining = &key_str[value_key.len()..]; KeyBuilder::parse_first_key_value_segment(&remaining) }; - + match segment { Some((Segment::ObjectKey(mut unescaped), escaped)) => { let mut object: Vec<(String, JsonValue)> = Vec::new(); @@ -306,18 +311,18 @@ impl JsonFetcher { loop { let json_val = JsonFetcher::do_fetch(iter, &value_key_next, key, value); object.push((unescaped, json_val)); - + let segment = match iter.peek() { Some(&(ref k, ref _v)) => { if !k.starts_with(value_key.as_bytes()) { return JsonValue::Object(object); } - let key_str = unsafe{str::from_utf8_unchecked(&k)}; + let key_str = unsafe { str::from_utf8_unchecked(&k) }; let remaining = &key_str[value_key.len()..]; KeyBuilder::parse_first_key_value_segment(&remaining) - }, + } None => return JsonValue::Object(object), }; @@ -347,18 +352,18 @@ impl JsonFetcher { loop { let json_val = JsonFetcher::do_fetch(iter, &value_key_next, key, value); array.push((i, json_val)); - + let segment = match iter.peek() { Some(&(ref k, ref _v)) => { if !k.starts_with(value_key.as_bytes()) { return JsonFetcher::return_array(array); } - let key_str = unsafe{str::from_utf8_unchecked(&k)}; + let key_str = unsafe { str::from_utf8_unchecked(&k) }; let remaining = &key_str[value_key.len()..]; - KeyBuilder::parse_first_key_value_segment(&remaining) - }, + KeyBuilder::parse_first_key_value_segment(&remaining) + } None => return JsonFetcher::return_array(array), }; @@ -369,7 +374,7 @@ impl JsonFetcher { Some((k, v)) => { key = k; value = v; - }, + } None => panic!("couldn't advanced already peeked iter"), }; value_key_next.truncate(value_key.len()); @@ -378,12 +383,13 @@ impl JsonFetcher { return JsonFetcher::return_array(array); } } - }, + } None => { - let key_str = unsafe{str::from_utf8_unchecked(&key)}; - panic!("somehow couldn't parse key segment {} {}", value_key, key_str); - }, + let key_str = unsafe { str::from_utf8_unchecked(&key) }; + panic!("somehow couldn't parse key segment {} {}", + value_key, + key_str); + } } } } - diff --git a/src/stems.rs b/src/stems.rs index 6c991f0..ee55a20 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -24,7 +24,7 @@ pub struct StemmedWord { impl<'a> Stems<'a> { pub fn new(text: &str) -> Stems { - Stems{ + Stems { words: text.split_word_bound_indices(), stemmer: Stemmer::new("english").unwrap(), word_position: 0, @@ -38,34 +38,34 @@ impl<'a> Iterator for Stems<'a> { fn next(&mut self) -> Option { // we loop though until we find alphabetic chars. That becomes our stem word. 
let mut non_alpha = String::new(); // will contain any non-alphabetic chars - // returned iff no other alphabetic chars + // returned iff no other alphabetic chars while let Some((_pos, word)) = self.words.next() { let normalized = word.nfkc().collect::(); if normalized.chars().next().unwrap().is_alphabetic() { let pos = self.word_position; self.word_position += 1; return Some(StemmedWord { - word_pos: pos as u32, - stemmed: self.stemmer.stem(&normalized.to_lowercase()), - }); + word_pos: pos as u32, + stemmed: self.stemmer.stem(&normalized.to_lowercase()), + }); } else { if self.word_position == 0 { non_alpha.push_str(&normalized); } } - } + } if non_alpha.is_empty() { if self.word_position == 0 { self.word_position = 1; // in this case we were passed an empty string - // so we don't just return None, but we return + // so we don't just return None, but we return // an empty string Stemmed word. // otherwise searching fields for empty strings // wouldn't be possible. return Some(StemmedWord { - word_pos: 0, - stemmed: String::new(), - }); + word_pos: 0, + stemmed: String::new(), + }); } else { return None; } @@ -73,9 +73,9 @@ impl<'a> Iterator for Stems<'a> { if self.word_position == 0 { self.word_position = 1; return Some(StemmedWord { - word_pos: 0, - stemmed: non_alpha, - }); + word_pos: 0, + stemmed: non_alpha, + }); } else { return None; } @@ -92,16 +92,35 @@ mod tests { fn test_stems_mixedcase() { let input = "THEse Words deeplY test smOOthly that stemmING"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { word_pos: 0, stemmed: String::from("these")}, - StemmedWord { word_pos: 1, stemmed: String::from("word")}, - // "deeply" stems to "deepli" - StemmedWord { word_pos: 2, stemmed: String::from("deepli")}, - StemmedWord { word_pos: 3, stemmed: String::from("test")}, - StemmedWord { word_pos: 4, stemmed: String::from("smooth")}, - StemmedWord { word_pos: 5, stemmed: String::from("that")}, - StemmedWord { word_pos: 6, stemmed: String::from("stem")}, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("these"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("word"), + }, + // "deeply" stems to "deepli" + StemmedWord { + word_pos: 2, + stemmed: String::from("deepli"), + }, + StemmedWord { + word_pos: 3, + stemmed: String::from("test"), + }, + StemmedWord { + word_pos: 4, + stemmed: String::from("smooth"), + }, + StemmedWord { + word_pos: 5, + stemmed: String::from("that"), + }, + StemmedWord { + word_pos: 6, + stemmed: String::from("stem"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -112,17 +131,25 @@ mod tests { fn test_stems_nonchars() { let input = " @#$!== \t+-"; let result = Stems::new(input).collect::>(); - assert_eq!(result, vec![StemmedWord { word_pos: 0, stemmed: String::from(" @#$!== \t+-")}]); + assert_eq!(result, + vec![StemmedWord { + word_pos: 0, + stemmed: String::from(" @#$!== \t+-"), + }]); } #[test] fn test_stems_some_nonchars() { let input = "@!? 
Let's seeing..."; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { word_pos: 0, stemmed: String::from("let")}, - StemmedWord { word_pos: 1, stemmed: String::from("see")}, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("let"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("see"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -133,10 +160,14 @@ mod tests { fn test_stems_unicode() { let input = "Ünicöde stemming"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { word_pos: 0, stemmed: String::from("ünicöd")}, - StemmedWord { word_pos: 1, stemmed: String::from("stem")}, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("ünicöd"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("stem"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -147,10 +178,14 @@ mod tests { fn test_stems_trailing_needs_normalized() { let input = r#"Didgeridoos™"#; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { word_pos: 0, stemmed: String::from("didgeridoo")}, - StemmedWord { word_pos: 1, stemmed: String::from("tm")}, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("didgeridoo"), + }, + StemmedWord { + word_pos: 1, + stemmed: String::from("tm"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -161,7 +196,10 @@ mod tests { fn test_stems_unicode_lowercase_has_more_bytes() { let input = "İ"; let result = Stems::new(input).collect::>(); - let expected = vec![StemmedWord { word_pos: 0, stemmed: String::from("i̇")}]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("i̇"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); @@ -185,7 +223,8 @@ mod tests { // let upper = "Ρ̓"; // let lower = "ῤ"; // println!("lower({}) == {}: {}", upper, lower, upper.to_lowercase() == lower); - // println!("lower({}) == lower(upper({})): {}", upper, lower, upper.to_lowercase() == lower.to_uppercase().to_lowercase()); + // println!("lower({}) == lower(upper({})): {}", upper, lower, upper.to_lowercase() == + // lower.to_uppercase().to_lowercase()); // lower(Ρ̓) == ῤ: false // lower(Ρ̓) == lower(upper(ῤ)): true #[test] @@ -193,9 +232,10 @@ mod tests { // The input is: Ρ̓ῤῤ (11 bytes), lowercases is ῤῤῤ (9 bytes) let input = "\u{03A1}\u{0313}\u{03C1}\u{0313}\u{1FE4}"; let result = Stems::new(input).collect::>(); - let expected = vec![ - StemmedWord { word_pos: 0, stemmed: String::from("\u{03C1}\u{0313}\u{1FE4}\u{1FE4}")}, - ]; + let expected = vec![StemmedWord { + word_pos: 0, + stemmed: String::from("\u{03C1}\u{0313}\u{1FE4}\u{1FE4}"), + }]; assert_eq!(result.len(), expected.len()); for (stem, expected_stem) in result.iter().zip(expected.iter()) { assert_eq!(stem, expected_stem); diff --git a/tests/repl_tests.rs b/tests/repl_tests.rs index 20eefb3..50f1b8a 100644 --- a/tests/repl_tests.rs +++ b/tests/repl_tests.rs @@ -13,15 +13,18 @@ fn test_repl() { // .reject extension in the same directory where it can be investigated. 
// To update the test files with new command and output, simply edit/add commands and run - update-test-repl.sh script from the project root directory. Then examin or do a git diff to see - if the output is as expected. + update-test-repl.sh script from the project root directory. Then examine or do a git diff to + see if the output is as expected. let mut test_dir = env::current_dir().unwrap(); test_dir.push("repl-tests"); let mut failures = 0; let mut total = 0; // Sort files by last modified date to make debugging easier - let mut entries: Vec<_> = fs::read_dir(test_dir).unwrap().map(|r| r.unwrap()).collect(); + let mut entries: Vec<_> = fs::read_dir(test_dir) + .unwrap() + .map(|r| r.unwrap()) + .collect(); entries.sort_by_key(|entry| entry.metadata().unwrap().modified().unwrap()); for entry in entries { let mut path = entry.path(); @@ -29,7 +32,11 @@ fn test_repl() { continue; } total += 1; - let test_name = path.file_name().unwrap().to_str().unwrap().to_string(); + let test_name = path.file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(); println!("About to run test {} ", test_name); let mut file = File::open(path.clone()).unwrap(); let mut file_buffer = Vec::new(); @@ -38,20 +45,30 @@ fn test_repl() { let mut test_result_buffer = Vec::new(); let file = File::open(path.clone()).unwrap(); repl(&mut BufReader::new(file), &mut test_result_buffer, true); - + if file_buffer != test_result_buffer { failures += 1; path.set_extension("reject"); - let reject = path.file_name().unwrap().to_str().unwrap().to_string(); + let reject = path.file_name() + .unwrap() + .to_str() + .unwrap() + .to_string(); let mut file = File::create(path.clone()).unwrap(); file.write_all(&test_result_buffer).unwrap(); file.sync_all().unwrap(); println!("Repl test {} failure. Failing output written to {} in repl-tests dir.", - test_name, reject); + test_name, + reject); } else { - println!("{} successful", path.file_name().unwrap().to_str().unwrap().to_string()); + println!("{} successful", + path.file_name() + .unwrap() + .to_str() + .unwrap() + .to_string()); } } if total == 0 { diff --git a/tests/rocksdb.rs b/tests/rocksdb.rs index 4a80b9c..29a5e45 100644 --- a/tests/rocksdb.rs +++ b/tests/rocksdb.rs @@ -1,5 +1,5 @@ extern crate rocksdb; -use rocksdb::{DB}; +use rocksdb::DB; #[test] fn rocksdb_works() { From bef2926dcf516edea7b14e4d62bd78176449051b Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 25 Apr 2017 13:08:45 -0700 Subject: [PATCH 100/122] Change sort clause to order clause This makes us consistent with other query languages. Plus I kept finding myself wanting to say order and ordering instead of sort. Seems more intuitive. --- repl-tests/collation.noise | 14 +-- repl-tests/scoring.noise | 32 +++++---- src/parser.rs | 78 ++++++++--------- src/query.rs | 174 ++++++++++++++++++------------------- src/returnable.rs | 82 ++++++++--------- 5 files changed, 190 insertions(+), 190 deletions(-) diff --git a/repl-tests/collation.noise b/repl-tests/collation.noise index c036969..1488f4c 100644 --- a/repl-tests/collation.noise +++ b/repl-tests/collation.noise @@ -1,4 +1,4 @@ -# Sort expressions. +# Order expressions. 
drop target/tests/querytestjsoncollation; create target/tests/querytestjsoncollation; @@ -39,7 +39,7 @@ add {"_id":"17", "foo":"coll", "bar": "string3"}; "17" find {foo: =="coll"} -sort .bar asc +order .bar asc return .bar ; [ null, @@ -62,7 +62,7 @@ true, ] find {foo: =="coll"} -sort .bar asc +order .bar asc return .bar limit 5; [ @@ -74,7 +74,7 @@ true, ] find {foo: =="coll"} -sort .bar asc +order .bar asc return .bar limit 1; [ @@ -103,7 +103,7 @@ add {"_id":"29", "foo":"coll2", "bar":[5,5,5]}; "29" find {foo: =="coll2"} -sort .bar[0] asc, .bar[1] desc, .bar[2] desc +order .bar[0] asc, .bar[1] desc, .bar[2] desc return [.bar[0], .bar[1], .bar[2]] ; [ [1,2,2], @@ -119,7 +119,7 @@ return [.bar[0], .bar[1], .bar[2]] ; ] find {foo: =="coll2"} -sort .bar[0] asc, .bar[1] desc, .bar[2] desc +order .bar[0] asc, .bar[1] desc, .bar[2] desc return [.bar[2], .bar[1], .bar[0]] ; [ [2,2,1], @@ -135,7 +135,7 @@ return [.bar[2], .bar[1], .bar[0]] ; ] find {foo: =="group2"} -sort .baz asc, .bar desc +order .baz asc, .bar desc return [.baz, .bar] limit 2; [] diff --git a/repl-tests/scoring.noise b/repl-tests/scoring.noise index bd43257..7d0cb7e 100644 --- a/repl-tests/scoring.noise +++ b/repl-tests/scoring.noise @@ -12,7 +12,7 @@ add {"_id":"3", "bar": "quick brown fox"}; "3" find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} -sort score() desc +order score() desc return ._id ; [ "3", @@ -21,7 +21,7 @@ return ._id ; ] find {bar: ~="quick brown fox"} -sort score() desc +order score() desc return ._id ; [ "3" @@ -64,7 +64,7 @@ return score() ; ] find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} -sort score() desc +order score() desc return score() ; [ 0.5773501992225647, @@ -73,7 +73,7 @@ return score() ; ] find ({bar: ~="fox" || bar: ~="brown" || bar: ~="quick"})^2 -sort score() desc +order score() desc return score() ; [ 1.1547003984451294, @@ -82,7 +82,7 @@ return score() ; ] find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} -sort score() desc +order score() desc return score() ; [ 0.5773501992225647, @@ -91,7 +91,7 @@ return score() ; ] find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"}^2 -sort score() desc +order score() desc return score() ; [ 1.1547003984451294, @@ -100,7 +100,7 @@ return score() ; ] find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} -sort score() desc +order score() desc return score() ; [ 0.5773501992225647, @@ -109,7 +109,7 @@ return score() ; ] find {bar: ~="fox"^2 || (bar: ~="brown" || bar: ~="quick")^2 } -sort score() desc +order score() desc return score() ; [ 1.1547003984451294, @@ -118,7 +118,7 @@ return score() ; ] find {bar: ~="fox" || bar: ~="brown" || bar: ~="quick"} -sort score() desc +order score() desc return score() ; [ 0.5773501992225647, @@ -127,7 +127,7 @@ return score() ; ] find {bar: ~="fox"}^2 || {bar: ~="brown" || bar: ~="quick"}^2 -sort score() desc +order score() desc return score() ; [ 1.1547003984451294, @@ -143,7 +143,7 @@ add {"_id":"6", "bar": ["quick brown fox"]}; "6" find {bar:[ ~="fox" || ~="brown" || ~="quick"]} -sort score() desc +order score() desc return score() ; [ 0.5773501992225647, @@ -152,7 +152,7 @@ return score() ; ] find {bar:[~="fox" || ~="brown" || ~="quick"]^2} -sort score() desc +order score() desc return score() ; [ 1.1547003984451294, @@ -161,7 +161,7 @@ return score() ; ] find {bar:[ ~="fox" || ~="brown" || ~="quick"]} -sort score() desc +order score() desc return score() ; [ 0.5773501992225647, @@ -170,7 +170,7 @@ return score() ; ] find {bar:[~="fox"]^2 || bar:[~="brown" || ~="quick"]^2} -sort score() 
desc +order score() desc return score() ; [ 1.1547003984451294, @@ -179,7 +179,7 @@ return score() ; ] find {bar:[ ~="fox" || ~="brown" || ~="quick"]} -sort score() desc +order score() desc return score() ; [ 0.5773501992225647, @@ -188,7 +188,7 @@ return score() ; ] find {bar:[~="fox"]^2 || (bar:[~="brown"] || bar:[~="quick"])^2} -sort score() desc +order score() desc return score() ; [ 1.1547003984451294, diff --git a/src/parser.rs b/src/parser.rs index 575f9bd..eb69495 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -9,7 +9,7 @@ use error::Error; use key_builder::KeyBuilder; use stems::Stems; use json_value::JsonValue; -use query::{Sort, SortInfo, SortField}; +use query::{Order, OrderInfo, OrderField}; use aggregates::AggregateFun; use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, RetScore, ReturnPath}; @@ -865,20 +865,20 @@ impl<'a, 'c> Parser<'a, 'c> { self.consume_boost_and_wrap_filter(filter) } - pub fn sort_clause(&mut self) -> Result, Error> { - let mut sort_infos = HashMap::new(); - if self.consume("sort") { + pub fn order_clause(&mut self) -> Result, Error> { + let mut order_infos = HashMap::new(); + if self.consume("order") { let mut n = 0; loop { if let Some(rp) = try!(self.consume_keypath()) { // doing the search for source 2x so user can order // anyway they like. Yes it's a hack, but it simple. - let mut sort = if self.consume("asc") { - Sort::Asc + let mut order = if self.consume("asc") { + Order::Asc } else if self.consume("desc") { - Sort::Desc + Order::Desc } else { - Sort::Asc + Order::Asc }; let default = if self.consume("default") { @@ -892,21 +892,21 @@ impl<'a, 'c> Parser<'a, 'c> { JsonValue::Null }; - sort = if self.consume("asc") { - Sort::Asc + order = if self.consume("asc") { + Order::Asc } else if self.consume("desc") { - Sort::Desc + Order::Desc } else { - sort + order }; - sort_infos.insert(rp.to_key(), - SortInfo { - field: SortField::FetchValue(rp), - sort: sort, - order_to_apply: n, - default: default, - }); + order_infos.insert(rp.to_key(), + OrderInfo { + field: OrderField::FetchValue(rp), + order: order, + order_to_apply: n, + default: default, + }); } else { try!(self.must_consume("score")); try!(self.must_consume("(")); @@ -914,21 +914,21 @@ impl<'a, 'c> Parser<'a, 'c> { self.needs_scoring = true; - let sort = if self.consume("asc") { - Sort::Asc + let order = if self.consume("asc") { + Order::Asc } else if self.consume("desc") { - Sort::Desc + Order::Desc } else { - Sort::Asc + Order::Asc }; - sort_infos.insert("score()".to_string(), - SortInfo { - field: SortField::Score, - order_to_apply: n, - sort: sort, - default: JsonValue::Null, - }); + order_infos.insert("score()".to_string(), + OrderInfo { + field: OrderField::Score, + order_to_apply: n, + order: order, + default: JsonValue::Null, + }); } if !self.consume(",") { @@ -936,11 +936,11 @@ impl<'a, 'c> Parser<'a, 'c> { } n += 1; } - if sort_infos.is_empty() { - return Err(Error::Parse("Expected field path in sort expression.".to_string())); + if order_infos.is_empty() { + return Err(Error::Parse("Expected field path in order expression.".to_string())); } } - Ok(sort_infos) + Ok(order_infos) } pub fn return_clause(&mut self) -> Result, Error> { @@ -957,7 +957,7 @@ impl<'a, 'c> Parser<'a, 'c> { rp: rp, ag: None, default: JsonValue::Null, - sort_info: None, + order_info: None, })) } } @@ -1016,7 +1016,7 @@ impl<'a, 'c> Parser<'a, 'c> { if self.consume("(") { try!(self.must_consume(")")); self.needs_scoring = true; - return Ok(Some(Box::new(RetScore { sort_info: None 
}))); + return Ok(Some(Box::new(RetScore { order_info: None }))); } else { //wasn't the score, maybe it's a bind variable self.offset = offset; @@ -1035,14 +1035,14 @@ impl<'a, 'c> Parser<'a, 'c> { extra_rp: rp, ag: Some((ag, json)), default: default, - sort_info: None, + order_info: None, }))) } else { Ok(Some(Box::new(RetValue { rp: rp, ag: Some((ag, json)), default: default, - sort_info: None, + order_info: None, }))) } } else if let Some(bind_name) = self.consume_field() { @@ -1063,7 +1063,7 @@ impl<'a, 'c> Parser<'a, 'c> { extra_rp: rp, ag: None, default: default, - sort_info: None, + order_info: None, }))) } else if let Some(rp) = try!(self.consume_keypath()) { let default = if let Some(default) = try!(self.consume_default()) { @@ -1076,7 +1076,7 @@ impl<'a, 'c> Parser<'a, 'c> { rp: rp, ag: None, default: default, - sort_info: None, + order_info: None, }))) } else if self.could_consume("{") { Ok(Some(try!(self.ret_object()))) diff --git a/src/query.rs b/src/query.rs index 103a97a..4f5c963 100644 --- a/src/query.rs +++ b/src/query.rs @@ -168,7 +168,7 @@ impl Query { let snapshot = index.new_snapshot(); let mut parser = Parser::new(query, snapshot); let mut filter = try!(parser.build_filter()); - let mut sorts = try!(parser.sort_clause()); + let mut orders = try!(parser.order_clause()); let mut returnable = try!(parser.return_clause()); let limit = try!(parser.limit_clause()); try!(parser.non_ws_left()); @@ -190,28 +190,28 @@ impl Query { break; } } - let has_sorting = !sorts.is_empty(); + let has_ordering = !orders.is_empty(); - returnable = if has_sorting && has_ags { - return Err(Error::Parse("Cannot have aggregates and sorting in the same query" + returnable = if has_ordering && has_ags { + return Err(Error::Parse("Cannot have aggregates and ordering in the same query" .to_string())); - } else if has_sorting { - returnable.take_sort_for_matching_fields(&mut sorts); - if !sorts.is_empty() { + } else if has_ordering { + returnable.take_order_for_matching_fields(&mut orders); + if !orders.is_empty() { let mut vec: Vec> = Vec::new(); - for (_key, sort_info) in sorts.into_iter() { - let sort = sort_info.clone(); - match sort_info.field { - SortField::FetchValue(rp) => { + for (_key, order_info) in orders.into_iter() { + let order = order_info.clone(); + match order_info.field { + OrderField::FetchValue(rp) => { vec.push(Box::new(RetValue { rp: rp, ag: None, - default: sort_info.default, - sort_info: Some(sort), + default: order_info.default, + order_info: Some(order), })); } - SortField::Score => { - vec.push(Box::new(RetScore { sort_info: Some(sort) })); + OrderField::Score => { + vec.push(Box::new(RetScore { order_info: Some(order) })); } } } @@ -237,30 +237,30 @@ impl Query { } } - let needs_sorting_and_ags = has_ags || has_sorting; + let needs_ordering_and_ags = has_ags || has_ordering; - // the input args for sorts and ags are vecs where the slot is the same slot as + // the input args for orders and ags are vecs where the slot is the same slot as // a result that the action needs to be applied to. We instead convert them // into several new fields with tuples of action and the slot to act on. 
// this way we don't needlesss loop over the actions where most are noops - let mut sorts = if has_sorting { - let mut sorts = Vec::new(); - let mut sorting = Vec::new(); - returnable.get_sorting(&mut sorting); - let mut n = sorting.len(); - while let Some(option) = sorting.pop() { + let mut orders = if has_ordering { + let mut orders = Vec::new(); + let mut ordering = Vec::new(); + returnable.get_ordering(&mut ordering); + let mut n = ordering.len(); + while let Some(option) = ordering.pop() { n -= 1; - if let Some(sort_info) = option { - sorts.push((sort_info, n)); + if let Some(order_info) = option { + orders.push((order_info, n)); } } - // order we process sorts is important - sorts.sort_by_key(|&(ref sort_info, ref _n)| sort_info.order_to_apply); - sorts + // order we process orders is important + orders.sort_by_key(|&(ref order_info, ref _n)| order_info.order_to_apply); + orders .into_iter() - .map(|(sort_info, n)| (sort_info.sort, n)) + .map(|(order_info, n)| (order_info.order, n)) .collect() } else { Vec::new() @@ -277,9 +277,9 @@ impl Query { while let Some(Some((ag, user_arg))) = ags.pop() { n -= 1; if ag == AggregateFun::GroupAsc { - sorts.push((Sort::Asc, n)); + orders.push((Order::Asc, n)); } else if ag == AggregateFun::GroupDesc { - sorts.push((Sort::Desc, n)); + orders.push((Order::Desc, n)); } else { let ag_impls = ag.get_fun_impls(); if let Some(init) = ag_impls.init { @@ -292,7 +292,7 @@ impl Query { } } // the order we process groups in important - sorts.reverse(); + orders.reverse(); } let mut qsi = QueryScoringInfo { @@ -316,15 +316,15 @@ impl Query { fetcher: parser.snapshot.new_json_fetcher(), snapshot: parser.snapshot, returnable: returnable, - needs_sorting_and_ags: needs_sorting_and_ags, - done_with_sorting_and_ags: false, + needs_ordering_and_ags: needs_ordering_and_ags, + done_with_ordering_and_ags: false, does_group_or_aggr: does_group_or_aggr, - sorts: Some(sorts), + orders: Some(orders), aggr_inits: aggr_inits, aggr_actions: aggr_actions, aggr_finals: aggr_finals, in_buffer: Vec::new(), - sorted_buffer: Vec::new(), + ordered_buffer: Vec::new(), limit: limit, scoring_num_terms: qsi.num_terms, scoring_query_norm: query_norm, @@ -339,15 +339,15 @@ pub struct QueryResults<'a> { snapshot: Snapshot<'a>, fetcher: JsonFetcher, returnable: Box, - needs_sorting_and_ags: bool, - done_with_sorting_and_ags: bool, + needs_ordering_and_ags: bool, + done_with_ordering_and_ags: bool, does_group_or_aggr: bool, - sorts: Option>, + orders: Option>, aggr_inits: Vec<(fn(JsonValue) -> JsonValue, usize)>, aggr_actions: Vec<(fn(&mut JsonValue, JsonValue, &JsonValue), JsonValue, usize)>, aggr_finals: Vec<(fn(&mut JsonValue), usize)>, in_buffer: Vec>, - sorted_buffer: Vec>, + ordered_buffer: Vec>, limit: usize, scoring_num_terms: usize, scoring_query_norm: f32, @@ -371,7 +371,7 @@ impl<'a> QueryResults<'a> { } fn get_next_result(&mut self) -> Option { - if self.done_with_sorting_and_ags { + if self.done_with_ordering_and_ags { return None; } let result = self.filter.first_result(&self.doc_result_next); @@ -408,9 +408,9 @@ impl<'a> QueryResults<'a> { } pub fn next_result(&mut self) -> Option { - if self.needs_sorting_and_ags { + if self.needs_ordering_and_ags { loop { - let next = if self.done_with_sorting_and_ags { + let next = if self.done_with_ordering_and_ags { None } else { self.get_next_result() @@ -427,23 +427,23 @@ impl<'a> QueryResults<'a> { &mut results); self.in_buffer.push(results); if self.in_buffer.len() == self.limit { - self.do_sorting_and_ags(); + 
self.do_ordering_and_ags(); } } None => { - if !self.done_with_sorting_and_ags { - self.do_sorting_and_ags(); - self.done_with_sorting_and_ags = true; + if !self.done_with_ordering_and_ags { + self.do_ordering_and_ags(); + self.done_with_ordering_and_ags = true; if !self.aggr_finals.is_empty() { // need to finalize the values - for end in self.sorted_buffer.iter_mut() { + for end in self.ordered_buffer.iter_mut() { for &(ref finalize, n) in self.aggr_finals.iter() { (finalize)(&mut end[n]); } } } } - if let Some(mut results) = self.sorted_buffer.pop() { + if let Some(mut results) = self.ordered_buffer.pop() { return Some(self.returnable.json_result(&mut results)); } else { return None; @@ -468,12 +468,12 @@ impl<'a> QueryResults<'a> { } } - fn cmp_results(sorts: &Vec<(Sort, usize)>, + fn cmp_results(orders: &Vec<(Order, usize)>, a: &VecDeque, b: &VecDeque) -> Ordering { - for &(ref sort_dir, n) in sorts.iter() { - let cmp = if *sort_dir != Sort::Desc { + for &(ref order_dir, n) in orders.iter() { + let cmp = if *order_dir != Order::Desc { b[n].cmp(&a[n]) } else { a[n].cmp(&b[n]) @@ -486,32 +486,32 @@ impl<'a> QueryResults<'a> { Ordering::Equal } - fn do_sorting_and_ags(&mut self) { + fn do_ordering_and_ags(&mut self) { // ugh borrow check madness means this is how this must happen. // we need to put it back before returning. - let sorts = self.sorts.take().unwrap(); - if !sorts.is_empty() { + let orders = self.orders.take().unwrap(); + if !orders.is_empty() { self.in_buffer - .sort_by(|a, b| QueryResults::cmp_results(&sorts, &a, &b)); + .sort_by(|a, b| QueryResults::cmp_results(&orders, &a, &b)); } // put back - self.sorts = Some(sorts); + self.orders = Some(orders); if !self.does_group_or_aggr { - if self.sorted_buffer.is_empty() { - swap(&mut self.sorted_buffer, &mut self.in_buffer); + if self.ordered_buffer.is_empty() { + swap(&mut self.ordered_buffer, &mut self.in_buffer); } else { - //merge the sorted buffers - let mut new_buffer = Vec::with_capacity(self.sorted_buffer.len() + + //merge the ordered buffers + let mut new_buffer = Vec::with_capacity(self.ordered_buffer.len() + self.in_buffer.len()); - let mut option_a = self.sorted_buffer.pop(); + let mut option_a = self.ordered_buffer.pop(); let mut option_b = self.in_buffer.pop(); // take out for borrow check - let sorts = self.sorts.take().unwrap(); + let orders = self.orders.take().unwrap(); loop { match (option_a, option_b) { (Some(a), Some(b)) => { - match QueryResults::cmp_results(&sorts, &a, &b) { + match QueryResults::cmp_results(&orders, &a, &b) { Ordering::Less => { new_buffer.push(b); option_a = Some(a); @@ -519,19 +519,19 @@ impl<'a> QueryResults<'a> { } Ordering::Greater => { new_buffer.push(a); - option_a = self.sorted_buffer.pop(); + option_a = self.ordered_buffer.pop(); option_b = Some(b); } Ordering::Equal => { new_buffer.push(a); new_buffer.push(b); - option_a = self.sorted_buffer.pop(); + option_a = self.ordered_buffer.pop(); option_b = self.in_buffer.pop(); } } if new_buffer.len() >= self.limit { - self.sorted_buffer.clear(); + self.ordered_buffer.clear(); self.in_buffer.clear(); new_buffer.truncate(self.limit); break; @@ -542,7 +542,7 @@ impl<'a> QueryResults<'a> { if new_buffer.len() == self.limit { break; } - while let Some(a) = self.sorted_buffer.pop() { + while let Some(a) = self.ordered_buffer.pop() { new_buffer.push(a); if new_buffer.len() == self.limit { break; @@ -567,25 +567,25 @@ impl<'a> QueryResults<'a> { } } // put back - self.sorts = Some(sorts); + self.orders = Some(orders); new_buffer.reverse(); - 
swap(&mut self.sorted_buffer, &mut new_buffer); + swap(&mut self.ordered_buffer, &mut new_buffer); } return; } - //merge the sorted buffers - let mut new_buffer = Vec::with_capacity(self.sorted_buffer.len() + self.in_buffer.len()); - let mut option_old = self.sorted_buffer.pop(); + //merge the ordered buffers + let mut new_buffer = Vec::with_capacity(self.ordered_buffer.len() + self.in_buffer.len()); + let mut option_old = self.ordered_buffer.pop(); let mut option_new = self.in_buffer.pop(); // take out for borrow check - let sorts = self.sorts.take().unwrap(); + let orders = self.orders.take().unwrap(); loop { match (option_old, option_new) { (Some(mut old), Some(mut new)) => { - match QueryResults::cmp_results(&sorts, &old, &new) { + match QueryResults::cmp_results(&orders, &old, &new) { Ordering::Less => { for &(ref init, n) in self.aggr_inits.iter() { // we can't swap out a value of new directly, so this lets us @@ -595,15 +595,15 @@ impl<'a> QueryResults<'a> { swap(&mut new_n, &mut new[n]); new[n] = (init)(new_n); } - //push back old value into sorted_buffer, + //push back old value into ordered_buffer, //then use new value as old value. - self.sorted_buffer.push(old); + self.ordered_buffer.push(old); option_old = Some(new); option_new = self.in_buffer.pop(); } Ordering::Greater => { new_buffer.push(old); - option_old = self.sorted_buffer.pop(); + option_old = self.ordered_buffer.pop(); option_new = Some(new); } Ordering::Equal => { @@ -620,7 +620,7 @@ impl<'a> QueryResults<'a> { } } if new_buffer.len() == self.limit { - self.sorted_buffer.clear(); + self.ordered_buffer.clear(); self.in_buffer.clear(); break; } @@ -630,7 +630,7 @@ impl<'a> QueryResults<'a> { if new_buffer.len() == self.limit { break; } - while let Some(old) = self.sorted_buffer.pop() { + while let Some(old) = self.ordered_buffer.pop() { new_buffer.push(old); if new_buffer.len() == self.limit { break; @@ -654,10 +654,10 @@ impl<'a> QueryResults<'a> { } } // put back - self.sorts = Some(sorts); + self.orders = Some(orders); new_buffer.reverse(); - swap(&mut self.sorted_buffer, &mut new_buffer); + swap(&mut self.ordered_buffer, &mut new_buffer); } } @@ -670,22 +670,22 @@ impl<'a> Iterator for QueryResults<'a> { } #[derive(PartialEq, Eq, Clone)] -pub enum Sort { +pub enum Order { Asc, Desc, } #[derive(Clone)] -pub enum SortField { +pub enum OrderField { FetchValue(ReturnPath), Score, } #[derive(Clone)] -pub struct SortInfo { - pub field: SortField, +pub struct OrderInfo { + pub field: OrderField, pub order_to_apply: usize, - pub sort: Sort, + pub order: Order, pub default: JsonValue, } diff --git a/src/returnable.rs b/src/returnable.rs index dac4253..84e8db2 100644 --- a/src/returnable.rs +++ b/src/returnable.rs @@ -5,7 +5,7 @@ use std::collections::VecDeque; use key_builder::KeyBuilder; use json_value::JsonValue; -use query::SortInfo; +use query::OrderInfo; use snapshot::JsonFetcher; use aggregates::AggregateFun; @@ -93,18 +93,18 @@ pub trait Returnable { /// aggregate function it's using and the default value. fn get_aggregate_funs(&self, funs: &mut Vec>); - /// If a query has a sort clause then we want to match the fields being sorted with - /// fields being returned. We pass the sorting info by the path of the sorted fields - /// or scores and Returnables that have the same path will take the sort + /// If a query has a order clause then we want to match the fields being ordered with + /// fields being returned. 
We pass the ordering info by the path of the ordered fields + /// or scores and Returnables that have the same path will take the order /// information. Any fields not matching a returnable are then added to special hidden - /// Returnable (RetHidden) which fetches those fields for sorting but not rendered or + /// Returnable (RetHidden) which fetches those fields for ordering but not rendered or /// returned. - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap); + fn take_order_for_matching_fields(&mut self, map: &mut HashMap); - /// Each Returnable will return the sorting direction in the same slot as the returnable - /// so that later after fetching they will be sorted by QueryResults after fetching but + /// Each Returnable will return the ordering direction in the same slot as the returnable + /// so that later after fetching they will be ordered by QueryResults after fetching but /// converting to the final json result. - fn get_sorting(&mut self, sorts: &mut Vec>); + fn get_ordering(&mut self, orders: &mut Vec>); /// This is the final step of a Returnable. The previous fetched JsonValues are now /// rendered with other ornamental json elements. @@ -134,15 +134,15 @@ impl Returnable for RetObject { } } - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { for &mut (ref _key, ref mut field) in self.fields.iter_mut() { - field.take_sort_for_matching_fields(map); + field.take_order_for_matching_fields(map); } } - fn get_sorting(&mut self, sorts: &mut Vec>) { + fn get_ordering(&mut self, orders: &mut Vec>) { for &mut (ref mut _key, ref mut field) in self.fields.iter_mut() { - field.get_sorting(sorts); + field.get_ordering(orders); } } @@ -178,15 +178,15 @@ impl Returnable for RetArray { } } - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { for slot in self.slots.iter_mut() { - slot.take_sort_for_matching_fields(map); + slot.take_order_for_matching_fields(map); } } - fn get_sorting(&mut self, sorts: &mut Vec>) { + fn get_ordering(&mut self, orders: &mut Vec>) { for ref mut slot in self.slots.iter_mut() { - slot.get_sorting(sorts); + slot.get_ordering(orders); } } @@ -199,7 +199,7 @@ impl Returnable for RetArray { } } -/// A special returnable that only fetches values for later sorting but never renders +/// A special returnable that only fetches values for later ordering but never renders /// them back to the caller. 
pub struct RetHidden { pub unrendered: Vec>, @@ -225,21 +225,21 @@ impl Returnable for RetHidden { self.visible.get_aggregate_funs(funs); } - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.visible.take_sort_for_matching_fields(map); + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.visible.take_order_for_matching_fields(map); } - fn get_sorting(&mut self, sorts: &mut Vec>) { + fn get_ordering(&mut self, orders: &mut Vec>) { for ref mut unrendered in self.unrendered.iter_mut() { - unrendered.get_sorting(sorts); + unrendered.get_ordering(orders); } - self.visible.get_sorting(sorts); + self.visible.get_ordering(orders); } fn json_result(&self, results: &mut VecDeque) -> JsonValue { for _n in 0..self.unrendered.len() { - // we already sorted at this point, now discard the values + // we already ordered at this point, now discard the values results.pop_front(); } self.visible.json_result(results) @@ -265,11 +265,11 @@ impl Returnable for RetLiteral { //noop } - fn take_sort_for_matching_fields(&mut self, _map: &mut HashMap) { + fn take_order_for_matching_fields(&mut self, _map: &mut HashMap) { //noop } - fn get_sorting(&mut self, _sorts: &mut Vec>) { + fn get_ordering(&mut self, _orders: &mut Vec>) { //noop } @@ -284,7 +284,7 @@ pub struct RetValue { pub rp: ReturnPath, pub ag: Option<(AggregateFun, JsonValue)>, pub default: JsonValue, - pub sort_info: Option, + pub order_info: Option, } @@ -312,12 +312,12 @@ impl Returnable for RetValue { funs.push(self.ag.clone()); } - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.sort_info = map.remove(&self.rp.to_key()); + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.order_info = map.remove(&self.rp.to_key()); } - fn get_sorting(&mut self, sorts: &mut Vec>) { - sorts.push(self.sort_info.take()); + fn get_ordering(&mut self, orders: &mut Vec>) { + orders.push(self.order_info.take()); } fn json_result(&self, results: &mut VecDeque) -> JsonValue { @@ -337,7 +337,7 @@ pub struct RetBind { pub extra_rp: ReturnPath, pub ag: Option<(AggregateFun, JsonValue)>, pub default: JsonValue, - pub sort_info: Option, + pub order_info: Option, } impl Returnable for RetBind { @@ -370,12 +370,12 @@ impl Returnable for RetBind { funs.push(self.ag.clone()); } - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.sort_info = map.remove(&(self.bind_name.to_string() + &self.extra_rp.to_key())); + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.order_info = map.remove(&(self.bind_name.to_string() + &self.extra_rp.to_key())); } - fn get_sorting(&mut self, sorts: &mut Vec>) { - sorts.push(self.sort_info.take()); + fn get_ordering(&mut self, orders: &mut Vec>) { + orders.push(self.order_info.take()); } fn json_result(&self, results: &mut VecDeque) -> JsonValue { @@ -389,7 +389,7 @@ impl Returnable for RetBind { /// Returns a relevency score for a match. 
pub struct RetScore { - pub sort_info: Option, + pub order_info: Option, } impl Returnable for RetScore { @@ -406,12 +406,12 @@ impl Returnable for RetScore { // noop } - fn take_sort_for_matching_fields(&mut self, map: &mut HashMap) { - self.sort_info = map.remove("score()"); + fn take_order_for_matching_fields(&mut self, map: &mut HashMap) { + self.order_info = map.remove("score()"); } - fn get_sorting(&mut self, sorts: &mut Vec>) { - sorts.push(self.sort_info.take()); + fn get_ordering(&mut self, orders: &mut Vec>) { + orders.push(self.order_info.take()); } fn json_result(&self, results: &mut VecDeque) -> JsonValue { From 59858d4226e9d6b4173f7e382d7367c653f0c4a2 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 25 Apr 2017 13:12:40 -0700 Subject: [PATCH 101/122] Bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 2ba77aa..7c13cf9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "noise_search" -version = "0.1.0" +version = "0.2.0" authors = ["Damien Katz ", "Volker Mische "] repository = "https://github.com/pipedown/noise.git" homepage = "https://github.com/pipedown/noise.git" From 99ecaa2b0f75a982fe9a02998264a8723b598387 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 25 Apr 2017 19:01:02 -0700 Subject: [PATCH 102/122] Fix test updater to point to correct executable Earlier changes in the cargo.toml file means the executable has a different filename. --- update-test-repl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/update-test-repl.sh b/update-test-repl.sh index f127601..e8e2937 100755 --- a/update-test-repl.sh +++ b/update-test-repl.sh @@ -4,7 +4,7 @@ SCRIPTPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" DIRNAME="$(dirname ${SCRIPTPATH})" -NOISE="${DIRNAME}/target/debug/noise" +NOISE="${DIRNAME}/target/debug/noise_search" REPL_TEST_DIR="${DIRNAME}/repl-tests" if [[ ! -f "${NOISE}" ]]; then From 22730e61f109bf2d893cb224e19ee82ef32e4705 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 25 Apr 2017 19:06:14 -0700 Subject: [PATCH 103/122] Fix for issue #21 `default` return with single digit doesn't work Instead of early returning Ok(None) now correctly break out of loop and parse single digits. --- repl-tests/query_basic.noise | 18 ++++++++++++++++++ src/parser.rs | 6 +++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise index cd718f0..9ff1b41 100644 --- a/repl-tests/query_basic.noise +++ b/repl-tests/query_basic.noise @@ -220,6 +220,24 @@ return {foo: .B default={bar:"bar"}}; {"foo":{"bar":"bar"}} ] +find {A:[ == "foo"]} +return .B default=0; +[ +0 +] + +find {A:[ == "foo"]} +return .B default=1; +[ +1 +] + +find {A:[ == "foo"]} +return .B default=-1; +[ +-1 +] + # return every kind of element find {A:[ == "foo"]} diff --git a/src/parser.rs b/src/parser.rs index eb69495..04b0724 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -322,7 +322,7 @@ impl<'a, 'c> Parser<'a, 'c> { if let Some(c) = chars.next() { c } else { - return Ok(None); + return Err(Error::Parse("Expected digits after sign (-).".to_string())); } } else { c @@ -336,14 +336,14 @@ impl<'a, 'c> Parser<'a, 'c> { if let Some(c) = chars.next() { c } else { - return Ok(None); + break 'outer; } } else if c >= '1' && c <= '9' { result.push(c); if let Some(c) = chars.next() { c } else { - return Ok(None); + break 'outer; } } else if result.is_empty() { // no sign or digits found. 
not a number From 7b75c67a11b599ba1bd93da2d5c70ddb552b92e2 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 27 Apr 2017 15:25:13 -0700 Subject: [PATCH 104/122] Scoring for comparison operators For comparison operators the term score should always be 1. --- repl-tests/ranges.noise | 120 +++++++++++++++++++++++++++++++++++++++ repl-tests/scoring.noise | 4 +- src/filters.rs | 29 ++++++++-- 3 files changed, 147 insertions(+), 6 deletions(-) diff --git a/repl-tests/ranges.noise b/repl-tests/ranges.noise index e1bb0c0..ba75e81 100644 --- a/repl-tests/ranges.noise +++ b/repl-tests/ranges.noise @@ -138,3 +138,123 @@ find {boolarray: [==true]}; "nine", "ten" ] + +# scoring + +find {A: ==true} +return score(); +[ +1 +] + +find {A: ==false} +return score(); +[ +1 +] + +find {A: ==null} +return score(); +[ +1 +] + +find {boolarray: [==true]} +return score(); +[ +1, +1 +] + + + +find {A: >10, A: <20} +return score(); +[ +1 +] + +find {A: >-10, A: <20} +return score(); +[ +1, +1 +] + +find {A: <-1} +return score(); +[ +1 +] + +find {A: <20} +return score(); +[ +1, +1 +] + +find {numberarray: [<70]} +return score(); +[ +1 +] + +find {A: <-3} +return score(); +[] + +find {A: <=-3} +return score(); +[ +1 +] + + + +find {A: >20} +return score(); +[ +1 +] + +find {A: >-5} +return score(); +[ +1, +1, +1 +] + +find {numberarray: [>40]} +return score(); +[ +1 +] + +find {A: >35} +return score(); +[] + +find {A: >=35 || NotAField: ==50} +return score(); +[ +0.25 +] + +find {A: >=35 && NotAField: ==50} +return score(); +[] + + +find {A: ==12} +return score(); +[ +1 +] + +find {numberarray: [==60]} +return score(); +[ +1 +] diff --git a/repl-tests/scoring.noise b/repl-tests/scoring.noise index 7d0cb7e..74622de 100644 --- a/repl-tests/scoring.noise +++ b/repl-tests/scoring.noise @@ -42,13 +42,13 @@ return score() ; find {bar: =="quick brown fox"} return score() ; [ -0.05966803431510925 +1 ] find {bar: =="quick brown fox"^2} return score() ; [ -0.1193360686302185 +1 ] find {bar: ~2="quick brown fox"} diff --git a/src/filters.rs b/src/filters.rs index ab7a4bd..cd8c85f 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -242,6 +242,7 @@ pub struct ExactMatchFilter { kb: KeyBuilder, phrase: String, case_sensitive: bool, + term_ordinal: Option, } impl ExactMatchFilter { @@ -261,6 +262,7 @@ impl ExactMatchFilter { phrase.to_lowercase() }, case_sensitive: case_sensitive, + term_ordinal: None, } } @@ -280,6 +282,9 @@ impl ExactMatchFilter { self.phrase == string.to_lowercase() }; if matches { + if self.term_ordinal.is_some() { + dr.add_score(self.term_ordinal.unwrap(), 1.0); + } return Some(dr); } else { if let Some(next) = self.filter.next_result() { @@ -317,7 +322,10 @@ impl QueryRuntimeFilter for ExactMatchFilter { } fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { - self.filter.prepare_relevancy_scoring(&mut qsi); + // we score these as binary. Either they have a value of 1 or nothing. + self.term_ordinal = Some(qsi.num_terms); + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += 1.0; } fn check_double_not(&self, parent_is_neg: bool) -> Result<(), Error> { @@ -335,6 +343,7 @@ pub struct RangeFilter { min: Option, max: Option, keypath: String, + term_ordinal: Option, } impl RangeFilter { @@ -350,6 +359,7 @@ impl RangeFilter { max: max, // The keypath we use to seek to the correct key within RocksDB keypath: String::new(), + term_ordinal: None, } } } @@ -388,7 +398,10 @@ impl QueryRuntimeFilter for RangeFilter { // The key already matched, hence it's a valid doc result. Return it. 
if self.min == Some(RangeOperator::True) || self.min == Some(RangeOperator::False) || self.min == Some(RangeOperator::Null) { - let dr = KeyBuilder::parse_doc_result_from_key(&key_str); + let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); + if self.term_ordinal.is_some() { + dr.add_score(self.term_ordinal.unwrap(), 1.0); + } return Some(dr); } // Else it's a range query on numbers @@ -414,7 +427,10 @@ impl QueryRuntimeFilter for RangeFilter { }; if min_condition && max_condition { - let dr = KeyBuilder::parse_doc_result_from_key(&key_str); + let mut dr = KeyBuilder::parse_doc_result_from_key(&key_str); + if self.term_ordinal.is_some() { + dr.add_score(self.term_ordinal.unwrap(), 1.0); + } return Some(dr); } // Else: No match => KKeep looping and move on to the next key @@ -423,7 +439,12 @@ impl QueryRuntimeFilter for RangeFilter { } // TODO vmx 2017-04-13: Scoring is not implemented yet - fn prepare_relevancy_scoring(&mut self, _qsi: &mut QueryScoringInfo) {} + fn prepare_relevancy_scoring(&mut self, qsi: &mut QueryScoringInfo) { + // we score these as binary. Either they have a value of 1 or nothing. + self.term_ordinal = Some(qsi.num_terms); + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += 1.0; + } fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { Ok(()) From 65c0bd9c05e3d8c5b144073c2376e86c0fbe341f Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 27 Apr 2017 17:24:20 -0700 Subject: [PATCH 105/122] fix for empty doc problem https://github.com/pipedown/node-noise/pull/5 --- src/index.rs | 20 ++++++++++++++++++++ src/json_shred.rs | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index d127284..6a37c1c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -457,4 +457,24 @@ mod tests { ("V1#.foo".to_string(), JsonValue::String("array".to_string()))]; assert_eq!(results, expected); } + + #[test] + fn test_empty_doc() { + let dbname = "target/tests/testemptydoc"; + let _ = Index::drop(dbname); + + let mut index = Index::new(); + index.open(dbname, Some(OpenOptions::Create)).unwrap(); + + let mut batch = Batch::new(); + let id = index.add("{}", &mut batch).unwrap(); + + index.flush(batch).unwrap(); + let query = r#"find {_id:==""#.to_string() + &id + "\"} return ."; + let mut results = Query::get_matches(&query, &index).unwrap(); + let json = results.next().unwrap(); + assert_eq!(json, + JsonValue::Object(vec![("_id".to_string(), JsonValue::String(id))])); + + } } diff --git a/src/json_shred.rs b/src/json_shred.rs index 8ef31ca..5842801 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -362,9 +362,11 @@ impl Shredder { } Some(JsonEvent::ObjectEnd) => { self.kb.pop_object_key(); - if !self.object_keys_indexed.pop().unwrap() { + if self.kb.keypath_segments_len() > 0 && + !self.object_keys_indexed.pop().unwrap() { // this means we never wrote a key because the object was empty. // So preserve the empty object by writing a special value. + // but not for the root object. it will always have _id field added. try!(self.maybe_add_value(&parser, 'o', &[])); } self.kb.inc_top_array_offset(); From fc0ac2ed59fe607059bc5a5b2259283242a2cd08 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Thu, 27 Apr 2017 17:41:14 -0700 Subject: [PATCH 106/122] Bump version for bug fix. 
--- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7c13cf9..f35548a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "noise_search" -version = "0.2.0" +version = "0.2.1" authors = ["Damien Katz ", "Volker Mische "] repository = "https://github.com/pipedown/noise.git" homepage = "https://github.com/pipedown/noise.git" From 8a85211c2abcedab838de3826211aa6f03c606de Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Tue, 2 May 2017 23:33:42 -0700 Subject: [PATCH 107/122] Fix for key prefix bug #24 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of checking if the supplied key is a lexical prefix, check to make sure it’s a keypath prefix. --- repl-tests/query_basic.noise | 17 +++++++++++++++++ src/key_builder.rs | 14 ++++++++++++++ src/snapshot.rs | 13 ++++++++++--- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise index 9ff1b41..b1c7b3c 100644 --- a/repl-tests/query_basic.noise +++ b/repl-tests/query_basic.noise @@ -272,4 +272,21 @@ return .a[*].b[*].c; [[2],[4,5,6]] ] +# prefix bug + +add {"_id":"1", "prefix": true, "pre": "foo"}; +"1" + +find {prefix: == true} +return .pre; +[ +"foo" +] + +find {prefix: == true} +return .pref; +[ +null +] + diff --git a/src/key_builder.rs b/src/key_builder.rs index fc4378f..0ff4a8b 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -234,6 +234,20 @@ impl KeyBuilder { } } + // Returns true if the prefix str is a prefix of the true keypath + pub fn is_keypath_prefix(prefix: &str, keypath: &str) -> bool { + if keypath.starts_with(prefix) { + match keypath[prefix.len()..].chars().next() { + Some('.') => true, + Some('$') => true, + Some(_) => false, + None => true, + } + } else { + false + } + } + // returns the unescaped segment as Segment and the escaped segment as a slice pub fn parse_first_key_value_segment(keypath: &str) -> Option<(Segment, String)> { diff --git a/src/snapshot.rs b/src/snapshot.rs index 75c0719..8be8709 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -281,12 +281,17 @@ impl JsonFetcher { None => return None, }; - if !key.starts_with(value_key.as_bytes()) { + if !KeyBuilder::is_keypath_prefix(&value_key, unsafe { str::from_utf8_unchecked(&key) }) { return None; } Some(JsonFetcher::do_fetch(&mut iter.peekable(), &value_key, key, value)) } + /// When do_fetch is called it means we know we are going to find a value because + /// we prefix matched the keypath. What we are doing here is parsing the remaining + /// keypath to figure out the nested structure of the remaining keypath. So we + /// depth first recursively parse the keypath and return the value and inserting into + /// containers (arrays or objects) then iterate keys until the keypath no longer matches. 
fn do_fetch(iter: &mut Peekable<&mut DBIterator>,
             value_key: &str,
             mut key: Box<[u8]>,
@@ -314,7 +319,8 @@ impl JsonFetcher {
 
         let segment = match iter.peek() {
             Some(&(ref k, ref _v)) => {
-                if !k.starts_with(value_key.as_bytes()) {
+                let key = unsafe { str::from_utf8_unchecked(k) };
+                if !KeyBuilder::is_keypath_prefix(value_key, key) {
                     return JsonValue::Object(object);
                 }
 
@@ -355,7 +361,8 @@ impl JsonFetcher {
 
         let segment = match iter.peek() {
             Some(&(ref k, ref _v)) => {
-                if !k.starts_with(value_key.as_bytes()) {
+                let key = unsafe { str::from_utf8_unchecked(k) };
+                if !KeyBuilder::is_keypath_prefix(value_key, key) {
                     return JsonFetcher::return_array(array);
                 }

From fc7a96179882bdc555b19a789606f66dd63cf937 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Wed, 3 May 2017 00:15:02 -0700
Subject: [PATCH 108/122] Fix limit when results aren't sorted or grouped

Also changed the syntax so that when specifying a return path with a `.`
there can be no whitespace between the `.` and the field name. Subsequent
dots in the path, brackets, etc., can still have whitespace. This change
is because when a limit clause comes after a `return .`, the limit keyword
was interpreted as the field name.

---
 repl-tests/limit.noise | 132 +++++++++++++++++++++++++++++++++++++++++
 src/parser.rs | 11 +++-
 src/query.rs | 4 ++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 repl-tests/limit.noise

diff --git a/repl-tests/limit.noise b/repl-tests/limit.noise
new file mode 100644
index 0000000..15c7b66
--- /dev/null
+++ b/repl-tests/limit.noise
@@ -0,0 +1,132 @@
+# limit clause tests
+
+drop target/tests/querytestlimit;
+create target/tests/querytestlimit;
+
+
+add {"_id":"1", "A": 6};
+"1"
+add {"_id":"2", "A": 6};
+"2"
+add {"_id":"3", "A": 4};
+"3"
+add {"_id":"4", "A": 4};
+"4"
+add {"_id":"5", "A": 1};
+"5"
+
+# "limit" tests with find clause only
+
+find {A: >= 1};
+[
+"1",
+"2",
+"3",
+"4",
+"5"
+]
+
+find {A: >= 1}
+limit 1;
+[
+"1"
+]
+
+find {A: >= 1}
+limit 3;
+[
+"1",
+"2",
+"3"
+]
+
+find {A: < 5};
+[
+"3",
+"4",
+"5"
+]
+
+find {A: < 5}
+limit 2;
+[
+"3",
+"4"
+]
+
+# "limit" tests with ordering
+
+find {A: > 3}
+order .A;
+[
+"4",
+"3",
+"2",
+"1"
+]
+
+find {A: > 3}
+order .A
+limit 1;
+[
+"3"
+]
+
+# "limit" tests with return
+
+find {A: >= 1}
+return .;
+[
+{"A":6,"_id":"1"},
+{"A":6,"_id":"2"},
+{"A":4,"_id":"3"},
+{"A":4,"_id":"4"},
+{"A":1,"_id":"5"}
+]
+
+find {A: >= 1}
+return .
+limit 1; +[ +{"A":6,"_id":"1"} +] + +find {A: >= 1} +return .A; +[ +6, +6, +4, +4, +1 +] + +find {A: >= 1} +return .A +limit 1; +[ +6 +] + +# "limit" tests with return and ordering + +find {A: >= 1} +order .A +return .A; +[ +1, +4, +4, +6, +6 +] + +find {A: >= 1} +order .A +return .A +limit 3; +[ +1, +4, +4 +] diff --git a/src/parser.rs b/src/parser.rs index 04b0724..20a717d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -57,6 +57,15 @@ impl<'a, 'c> Parser<'a, 'c> { } } + fn consume_no_ws(&mut self, token: &str) -> bool { + if self.could_consume(token) { + self.offset += token.len(); + true + } else { + false + } + } + fn must_consume(&mut self, token: &str) -> Result<(), Error> { if self.could_consume(token) { @@ -227,7 +236,7 @@ impl<'a, 'c> Parser<'a, 'c> { } fn consume_keypath(&mut self) -> Result, Error> { - let key: String = if self.consume(".") { + let key: String = if self.consume_no_ws(".") { if self.consume("[") { let key = try!(self.must_consume_string_literal()); try!(self.must_consume("]")); diff --git a/src/query.rs b/src/query.rs index 4f5c963..8f27803 100644 --- a/src/query.rs +++ b/src/query.rs @@ -452,6 +452,10 @@ impl<'a> QueryResults<'a> { } } } else { + if self.limit == 0 { + return None; + } + self.limit -= 1; let dr = match self.get_next_result() { Some(dr) => dr, None => return None, From 4d82ee351beba6eef1fa8de08943816ccb2296d5 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Wed, 3 May 2017 17:56:52 -0700 Subject: [PATCH 109/122] Add `find *` to allow the returning of every document in the index Because sometimes you want to perform aggregations and ordering on all documents. This will allow for that. --- repl-tests/query_basic.noise | 44 ++++++++++++++++++++++++++++++++++++ src/filters.rs | 42 +++++++++++++++++++++++++++++++++- src/json_shred.rs | 10 ++++++-- src/key_builder.rs | 18 ++++++++++++++- src/parser.rs | 5 +++- src/snapshot.rs | 28 +++++++++++++++++++++++ 6 files changed, 142 insertions(+), 5 deletions(-) diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise index b1c7b3c..d776966 100644 --- a/repl-tests/query_basic.noise +++ b/repl-tests/query_basic.noise @@ -289,4 +289,48 @@ return .pref; null ] +# all docs + +find *; +[ +"1", +"10", +"11", +"12", +"13", +"14", +"15", +"16", +"2", +"3", +"4", +"5", +"6", +"7", +"8", +"9" +] + +find * +order score() +return [._id, score()]; +[ +["9",1], +["8",1], +["7",1], +["6",1], +["5",1], +["4",1], +["3",1], +["2",1], +["16",1], +["15",1], +["14",1], +["13",1], +["12",1], +["11",1], +["10",1], +["1",1] +] + diff --git a/src/filters.rs b/src/filters.rs index cd8c85f..c96605e 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -7,7 +7,7 @@ use error::Error; use key_builder::KeyBuilder; use query::{DocResult, QueryScoringInfo}; use json_value::JsonValue; -use snapshot::{Snapshot, DocResultIterator, Scorer, JsonFetcher}; +use snapshot::{Snapshot, DocResultIterator, Scorer, JsonFetcher, AllDocsIterator}; use rocksdb::{self, DBIterator, IteratorMode}; pub trait QueryRuntimeFilter { @@ -34,6 +34,46 @@ pub enum RangeOperator { Null, } + + +pub struct AllDocsFilter { + iter: AllDocsIterator, +} + +impl AllDocsFilter { + pub fn new(snapshot: &Snapshot) -> AllDocsFilter { + AllDocsFilter { iter: snapshot.new_all_docs_iterator() } + } +} + +impl QueryRuntimeFilter for AllDocsFilter { + fn first_result(&mut self, _start: &DocResult) -> Option { + self.next_result() + } + + fn next_result(&mut self) -> Option { + if let Some(mut dr) = self.iter.next() { + dr.add_score(1, 1.0); + Some(dr) + } else { + None 
+ } + } + + fn prepare_relevancy_scoring(&mut self, mut qsi: &mut QueryScoringInfo) { + qsi.num_terms += 1; + qsi.sum_of_idt_sqs += 1.0; + } + + fn check_double_not(&self, _parent_is_neg: bool) -> Result<(), Error> { + Ok(()) + } + + fn is_all_not(&self) -> bool { + false + } +} + pub struct StemmedWordFilter { iter: DocResultIterator, scorer: Scorer, diff --git a/src/json_shred.rs b/src/json_shred.rs index 5842801..acc65e0 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -274,9 +274,12 @@ impl Shredder { } self.shredded_key_values = BTreeMap::new(); - let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); + let key = KeyBuilder::id_to_seq_key(self.doc_id.as_ref().unwrap()); try!(batch.put(&key.into_bytes(), &seq.to_string().as_bytes())); + let key = KeyBuilder::seq_key(seq); + try!(batch.put(&key.into_bytes(), b"")); + Ok(()) } @@ -310,7 +313,10 @@ impl Shredder { } try!(batch.delete(&key.as_bytes())); } - let key = self.kb.id_to_seq_key(self.doc_id.as_ref().unwrap()); + let key = KeyBuilder::id_to_seq_key(self.doc_id.as_ref().unwrap()); + try!(batch.delete(&key.into_bytes())); + + let key = KeyBuilder::seq_key(seq); try!(batch.delete(&key.into_bytes())); Ok(()) } diff --git a/src/key_builder.rs b/src/key_builder.rs index 0ff4a8b..b5fa3ee 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -61,13 +61,29 @@ impl KeyBuilder { string } - pub fn id_to_seq_key(&self, id: &str) -> String { + pub fn id_to_seq_key(id: &str) -> String { let mut str = String::with_capacity(id.len() + 1); str.push('I'); str.push_str(&id); str } + pub fn seq_key(seq: u64) -> String { + let seq = seq.to_string(); + let mut str = String::with_capacity(seq.len() + 1); + str.push('S'); + str.push_str(&seq); + str + } + + pub fn parse_seq_key(key: &str) -> Option { + if key.starts_with("S") { + Some(key[1..].parse().unwrap()) + } else { + None + } + } + /// Build the index key that corresponds to a number primitive pub fn number_key(&self, seq: u64) -> String { let mut string = String::with_capacity(100); diff --git a/src/parser.rs b/src/parser.rs index 20a717d..38f8152 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -15,7 +15,7 @@ use returnable::{Returnable, RetValue, RetObject, RetArray, RetLiteral, RetBind, ReturnPath}; use filters::{QueryRuntimeFilter, ExactMatchFilter, StemmedWordFilter, StemmedWordPosFilter, StemmedPhraseFilter, DistanceFilter, AndFilter, OrFilter, BindFilter, BoostFilter, - NotFilter, RangeFilter, RangeOperator}; + NotFilter, RangeFilter, RangeOperator, AllDocsFilter}; use snapshot::Snapshot; @@ -559,6 +559,9 @@ impl<'a, 'c> Parser<'a, 'c> { if !self.consume("find") { return Err(Error::Parse("Missing 'find' keyword".to_string())); } + if self.consume("*") { + return Ok(Box::new(AllDocsFilter::new(&self.snapshot))); + } self.not_object() } diff --git a/src/snapshot.rs b/src/snapshot.rs index 8be8709..9408691 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -55,6 +55,12 @@ impl<'a> Snapshot<'a> { pub fn new_iterator(&self) -> DBIterator { self.rocks.iterator(IteratorMode::Start) } + + pub fn new_all_docs_iterator(&self) -> AllDocsIterator { + let mut iter = self.rocks.iterator(IteratorMode::Start); + iter.set_mode(IteratorMode::From(b"S", rocksdb::Direction::Forward)); + AllDocsIterator { iter: iter } + } } pub struct DocResultIterator { @@ -400,3 +406,25 @@ impl JsonFetcher { } } } + +pub struct AllDocsIterator { + iter: DBIterator, +} + +impl AllDocsIterator { + pub fn next(&mut self) -> Option { + match self.iter.next() { + Some((k, _v)) => { + let key = unsafe { 
str::from_utf8_unchecked(&k) };
                if let Some(seq) = KeyBuilder::parse_seq_key(key) {
                    let mut dr = DocResult::new();
                    dr.seq = seq;
                    Some(dr)
                } else {
                    None
                }
            }
            None => None,
        }
    }
}

From d59798ce64a200aeea93323e2bd21e003e1c033f Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Wed, 3 May 2017 20:59:22 -0700
Subject: [PATCH 110/122] Bump version number

And publish to cargo.

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index f35548a..a435a1d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "noise_search"
-version = "0.2.1"
+version = "0.3.0"
 authors = ["Damien Katz ", "Volker Mische "]
 repository = "https://github.com/pipedown/noise.git"
 homepage = "https://github.com/pipedown/noise.git"

From 24bad63b9e1ab3cc4854e300cd263b727f410ef1 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Fri, 5 May 2017 13:57:40 -0700
Subject: [PATCH 111/122] Syntax changes, get rid of `*`

Now to match on all docs, use `find {}`. When returning all of a field
of objects in an array, use foo[].bar

---
 repl-tests/query_basic.noise | 8 ++++----
 src/parser.rs | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/repl-tests/query_basic.noise b/repl-tests/query_basic.noise
index d776966..39106f5 100644
--- a/repl-tests/query_basic.noise
+++ b/repl-tests/query_basic.noise
@@ -257,7 +257,7 @@ add {"_id":"15", "a":[{"b":[{"c":1},{"c":2},{"c":3}]},{"b":[{"c":4},{"c":5},{"c"
 "15"
 
 find {"_id": =="15"}
-return .a[*].b[*].c;
+return .a[].b[].c;
 [
 [[1,2,3],[4,5,6]]
 ]
@@ -267,7 +267,7 @@ add {"_id":"16", "type": "nested", "a":[{"b":[{"b":1},{"c":2},{"b":3}]},{"b":[{"c":4},{"c":5},{"c":6}]}]};
 "16"
 find {"_id": =="16"}
-return .a[*].b[*].c;
+return .a[].b[].c;
 [
 [[2],[4,5,6]]
 ]

diff --git a/src/parser.rs b/src/parser.rs
index 38f8152..1096396 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -260,14 +260,14 @@ impl<'a, 'c> Parser<'a, 'c> {
         if self.consume("[") {
             if let Some(index) = try!(self.consume_integer()) {
                 ret_path.push_array(index as u64);
+                try!(self.must_consume("]"));
             } else {
-                if self.consume("*") {
+                if self.consume("]") {
                     ret_path.push_array_all();
                 } else {
                     return Err(Error::Parse("Expected array index integer or *.".to_string()));
                 }
             }
-            try!(self.must_consume("]"));
         } else if self.consume(".") {
             if let Some(key) = self.consume_field() {
                 ret_path.push_object_key(key);
@@ -559,9 +559,6 @@ impl<'a, 'c> Parser<'a, 'c> {
         if !self.consume("find") {
             return Err(Error::Parse("Missing 'find' keyword".to_string()));
         }
-        if self.consume("*") {
-            return Ok(Box::new(AllDocsFilter::new(&self.snapshot)));
-        }
         self.not_object()
     }
@@ -575,6 +572,9 @@ impl<'a, 'c> Parser<'a, 'c> {
     fn object<'b>(&'b mut self) -> Result, Error> {
         if self.consume("{") {
+            if self.consume("}") {
+                return Ok(Box::new(AllDocsFilter::new(&self.snapshot)));
+            }
             let mut left = try!(self.obool());
             try!(self.must_consume("}"));

From 2b134eb0e4bd28c32c5ca38dceef53fa85ef5623 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Fri, 5 May 2017 14:12:39 -0700
Subject: [PATCH 112/122] Bump version

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index a435a1d..d6894f7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "noise_search"
-version = "0.3.0"
+version = "0.4.0"
 authors = ["Damien Katz ", "Volker Mische "]
 repository = "https://github.com/pipedown/noise.git"
 homepage = "https://github.com/pipedown/noise.git"

From e73c30d30dc5a5a0fdd66db6ef307420c74c48e1 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Sun, 7 May 2017 23:45:02 -0700
Subject: [PATCH 113/122] Documentation for query language

Moving it out of the node-noise project README.md

---
 query_language_reference.md | 773 ++++++++++++++++++++++++++++++++++++
 1 file changed, 773 insertions(+)
 create mode 100644 query_language_reference.md

diff --git a/query_language_reference.md b/query_language_reference.md
new file mode 100644
index 0000000..ce292de
--- /dev/null
+++ b/query_language_reference.md
@@ -0,0 +1,773 @@
+# Noise Query Language

The Noise query language is an expressive example-based syntax for finding documents, formatting and returning specific information in the documents, and performing relevancy scoring, ordering, and aggregations.

## Find Clause

All queries have a `find` clause followed by an example-based query syntax. It is a combination of expressions that consist of three parts: the key to query, an operator, and the value to match.

This query will return the `_id` of every document with a `{"foo": "bar",...}`:

```
find {foo: == "bar"}
```

This query will match all documents in the index and return their `_id`.

```
find {}
```

To match on multiple fields, or even nested fields, simply construct the same JSON structure in query form.

Match on two fields:

```
find {foo: == "bar", fizz: == "buzz"}
```

Match on fields, one nested within another:

```
find {foo: == "bar", fizz: {fazz: == "buzz"}}
```

### Word Match Operator

`~=` is the full text match operator. Use it to find a word in a text field.

```
find {body: ~= "word"}
```

Put multiple words in the quoted string to find a phrase in the field.

```
find {body: ~= "a multi word sentence"}
```

To find words that are within a specified distance of each other, put the maximum word distance in the operator. This example will return results where each word is within 50 words of the others.

```
find {body: ~50= "bitcoin gold price"}
```

### Comparison Operators

Noise supports the following comparison operators:

|Operator|Description|Types
---------|-----------|-----
|`==`|Equality|Strings, Numbers, true, false, null
|`>`|Greater Than|Numbers
|`<`|Less Than|Numbers
|`>=`|Greater Than or Equal|Numbers
|`<=`|Less Than or Equal|Numbers

Noise does not do type conversions of datatypes. Strings only compare with strings, numbers only compare with numbers, etc.

### Finding Things in Arrays

Let's say you have a document like this with text in an array:

```
{"foo": ["bar", "baz"]}
```

To find an element with value `"baz"` in the array, use syntax like this:

```
find {foo:[ == "baz"]}
```

If objects are nested in an array, like this:

```
{"foo": [{"fiz": "bar"}, {"fiz": "baz"}]}
```

To find a `{"fiz": "baz"}` in the array, use syntax like this:

```
find {foo: [{fiz: == "baz"}]}
```

### Boolean Logic and Parens

Noise has full support for boolean logic using the `&&` (logical AND) and `||` (logical OR) operators and for nesting logic with parens.

The comma `,` in objects is actually the same as the `&&` operator. They can be used interchangeably for whichever is more readable.
Find a doc with `"foo"` or `"bar"` in the `body`:

```
find {body: ~= "foo" || body: ~= "bar"}
```

Find a doc that has `"foo"` or `"bar"` and has `"baz"` or `"biz"` in the `body`:

```
find {(body: ~= "foo" || body: ~= "bar") &&
      (body: ~= "baz" || body: ~= "biz")}
```

The fields can be nested as well. Find a doc where the nested field `fiz` contains either `"baz"` or `"biz"`:

```
find {foo: {fiz: ~= "baz" || fiz: ~= "biz"}}
```


### Not Operator

Use the `!` (logical NOT) to exclude matching criteria.

Find docs where `foo` has value `"bar"` and `fab` does not have value `"baz"`:

```
find {foo: == "bar", fab: !== "baz"}
```

You can use logical not with parentheses to negate everything enclosed. This example finds docs where `foo` has value `"bar"` and `fab` does not have value `"baz"` or `"biz"`:

```
find {foo: == "bar", !(fab: == "baz" || fab: == "biz")}
```

You cannot have every clause be negated. A query needs at least one non-negated clause.

Illegal:

```
find {foo: !~= "bar" && foo: !~= "baz"}
```

Illegal:

```
find {!(foo: ~= "bar" && foo: ~= "baz")}
```

Double negation is also not allowed.

Illegal:

```
find {foo: ~= "waz" && !(foo: ~= "bar" && foo: !~= "baz")}
```

### Relevancy Scoring and Boosting

Relevancy scoring uses a combination of a boolean model and a Term Frequency/Inverse Document Frequency (TF/IDF) scoring system, very similar to Lucene and Elasticsearch. The details of the scoring model are beyond the scope of this document.

To return results in relevancy score order (most relevant first), simply use the order clause with the `score()` function.

```
find {subject: ~= "hammer" || body: ~= "hammer"}
order score() desc
```

But if you want matches in `subject` fields to score higher than in `body` fields, you can boost the score with the `^` operator. It is a multiplier of the scores of associated clauses.

This boosts `subject` matches by 2x:

```
find {subject: ~= "hammer"^2 || body: ~= "hammer"}
order score() desc
```

You can also boost everything in parentheses, objects, or arrays:

```
find {(subject: ~= "hammer" || subject: ~= "nails")^2 ||
      body: ~= "hammer" || body: ~= "nails"}
order score() desc
```
Another way to express the same thing:

```
find {subject: ~= "hammer" || subject: ~= "nails"}^2 ||
     {body: ~= "hammer" || body: ~= "nails"}
order score() desc
```


## Order Clause

To return results in a particular order, use the order clause.

This will order results ascending based on the contents of the `baz` field:

```
find {foo: == "bar"}
order .baz
```

If `baz` doesn't exist, `null` will be the value used for ordering.

This will order `baz` descending:

```
find {foo: == "bar"}
order .baz desc
```

This will order `baz` ascending:

```
find {foo: == "bar"}
order .baz asc
```

This will order `baz` ascending with a default value of `1` if no `baz` value exists:

```
find {foo: == "bar"}
order .baz asc default=1
```

This will order `baz` ascending; for values of `baz` that are the same, those results are then ordered by `biz` descending:

```
find {foo: == "bar"}
order .baz asc, .biz desc
```

## Return Clause

The return clause is how data or scoring is returned to the client. You can extract the whole document, a single field, multiple fields, and perform aggregations.
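For instance, a query can return just one field of each matching document instead of the whole thing (a quick preview; every return form is covered in detail below):

```Thrift
find {foo: == "bar"}
return .baz
```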
For the examples in this section, the following document will be used:

```json
{
  "_id": "example",
  "foo": "bar",
  "baz": {"biz": "bar"},
  "faz": [
    {"fiz": 213},
    {"biz": 5463},
    {"biz": 73}
  ]
}
```

### Basic Dot Notation

A leading dot indicates the root of the document. To return the whole document, place a single dot in the return clause.

This will return the whole document for each document found.

```Thrift
find
  {foo: == "bar"}
return
  .
// [{
//   "_id": "example",
//   "foo": "bar",
//   "baz": {"biz": "bar"},
//   "faz": [
//     {"fiz": 213},
//     {"biz": 5463},
//     {"biz": 73}
//   ]
// }]
```

To return a specific field, place the field name after the dot:

```Thrift
find {foo: == "bar"}
return .baz
// [{"biz": "bar"}]
```

To return a nested field, use another dot:

```Thrift
find {foo: == "bar"}
return .baz.biz
// ["bar"]
```

To return an array element, use the array notation:

```Thrift
find {foo: == "bar"}
return .faz[1]
// [{"biz": 5463}]
```

To return an object field nested in the array, add a dot after the array notation:

```Thrift
find {foo: == "bar"}
return .faz[1].biz
// [5463]
```

To return multiple values, embed the return paths in other JSON structures.

For each match this example returns 2 values inside an array:

```Thrift
find {foo: == "bar"}
return [.baz, .faz]
// [[
//   {"biz": "bar"},
//   [{"fiz": 213}, {"biz": 5463}, {"biz": 73}]
// ]]
```

For each match this example returns 2 values inside an object:

```Thrift
find {foo: == "bar"}
return {baz: .baz, faz: .faz}
// [{
//   "baz": {"biz": "bar"},
//   "faz": [{"fiz": 213}, {"biz": 5463}, {"biz": 73}]
// }]
```

### Missing Values

Sometimes you'll want to return a field that doesn't exist on a matching document. When that happens, `null` is returned.

If you'd like a different value to be returned, use the `default=` option, like this:

```Thrift
find {foo: == "bar"}
return .hammer default=0
// [0]
```

Each returned value can have a default as well.

```Thrift
find {foo: == "bar"}
return {baz: .baz default=0, hammer: .hammer default=1}
// [{
//   "baz": {"biz": "bar"},
//   "hammer": 1
// }]
```



### Return a Field from All Objects Inside an Array

If you want to return a nested field inside an array, but for each object in the array, use `[]` with no index.

This will return each `biz` field as an array of values:

```Thrift
find {foo: == "bar"}
return .faz[].biz
// [[5463, 73]]
```

### Bind Variables: Return Only Matched Array Elements

If you are searching for nested values or objects nested in arrays, and you want to return only the matched objects, use the bind syntax before the array in the query. The bound value is always an array, as multiple elements might match.
Say you have a document like this:

```json
{
  "_id": "a",
  "foo": [
    {"fiz": "bar", "val": 4}, {"fiz": "baz", "val": 7}
  ],
  "bar": [
    {"fiz": "baz", "val": 9}
  ]
}

```

To return the object where `{"fiz": "bar", ...}` matches (but not the others), use a bind variable (`var::[...]`), like this:

```Thrift
find {foo: x::[{fiz: == "bar"}]}
return x
// [[{"fiz": "bar", "val": 4}]]
```

If instead you want to return the `val` field, add `.val` to the bind variable, like this:

```Thrift
find {foo: x::[{fiz: == "bar"}]}
return x.val
// [[4]]
```

You can have any number of bind variables:

```Thrift
find {foo: x::[{fiz: == "bar"}], foo: y::[{fiz: == "baz"}]}
return [x.val, y.val]
// [[[4], [7]]]
```

The same query as the previous one, but returning an object:

```Thrift
find {foo: x::[{fiz: == "bar"}], foo: y::[{fiz: == "baz"}]}
return {x: x.val, y: y.val}
// [{"x": [4], "y": [7]}]
```

You can reuse bind variables in different clauses and they'll be combined:

```Thrift
find {foo: x::[{fiz: == "baz"}] || bar: x::[{fiz: == "baz"}]}
return {x: x.val}
// [{"x": [7, 9]}]
```

## Limit Clause

To limit the number of results, use a limit clause at the end of the query.

This limits the results to the first 10 found:

```Thrift
find {foo: == "bar"}
return .baz
limit 10
```


## Grouping and Aggregation

Noise includes ways to group rows together and aggregate values.

For values you want to group together, use the `group(...)` function in the `return` clause.

For values that are grouped together you can then perform aggregations on other values and return that aggregation. If a group function is used, all other fields must also be grouped or aggregated.

The aggregation functions available are:

|function | Description|
---------------|-------------
|`array(...)`|Returns all values in the group as values in an array.|
|`array_flat(...)`|Returns all values in the group as values in an array. However, if an array is encountered it extracts all the values inside the array (and further nested arrays) and returns them as a single flat array.|
|`avg(...)`|Averages numeric values in the group. If numeric values are in arrays, it extracts the values from the arrays. Even if arrays are nested in arrays, it extracts through all levels of nested arrays and averages them.|
|`count()`|Returns the count of the grouped rows for each grouping.|
|`concat(... [sep="..."])`|Returns all the strings in the group as a single concatenated string. Other value types are ignored. Use the optional `sep="..."` to specify a separator between string values.|
|`max(...)`|Returns the maximum value in the group. See the type ordering below to see how different types are compared.|
|`max_array(...)`|Returns the maximum value in the group; if an array is encountered, the values inside the array are extracted and considered.|
|`min(...)`|Returns the minimum value in the group. See the type ordering below to see how different types are compared.|
|`min_array(...)`|Returns the minimum value in the group; if an array is encountered, the values inside the array are extracted and considered.|
|`sum(...)`|Sums numeric values in the group. If numeric values are in arrays, it extracts the values from the arrays. Even if arrays are nested in arrays, it extracts through all levels of nested arrays and sums them.|

To perform grouping and/or aggregation, each returned field needs either a grouping or an aggregate function. It's an error to use one on some returned fields but not others.
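For example, this return is valid because every returned field is wrapped in either a grouping or an aggregate function (a sketch reusing the sample documents from the examples below; the `total` key name is illustrative):

```
find {foo: == "group1"}
return {baz: group(.baz), total: sum(.bar)}
```

By contrast, `return {baz: .baz, total: sum(.bar)}` would be an error, because `.baz` is neither grouped nor aggregated.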
Groupings are ordered first on the leftmost `group(...)` function, then on the next leftmost, etc.

You do not need to use `group(...)` to perform aggregates. If you have no `group(...)` defined, then all rows are aggregated into a single row.



### Max/Min Type Ordering
The ordering of types for `max(...)` and `min(...)` is as follows:

null < false < true < number < string < array < object


## Group/Aggregate Examples


Let's say we have documents like this:

```json
{"foo":"group1", "baz": "a", "bar": 1}
{"foo":"group1", "baz": "b", "bar": 2}
{"foo":"group1", "baz": "c", "bar": 3}
{"foo":"group1", "baz": "a", "bar": 1}
{"foo":"group1", "baz": "b", "bar": 2}
{"foo":"group1", "baz": "c", "bar": 3}
{"foo":"group1", "baz": "a", "bar": 1}
{"foo":"group1", "baz": "b", "bar": 2}
{"foo":"group1", "baz": "c", "bar": 3}
{"foo":"group1", "baz": "a", "bar": 1}
{"foo":"group1", "baz": "b", "bar": 2}
{"foo":"group1", "baz": "c", "bar": 3}
{"foo":"group2", "baz": "a", "bar": "a"}
{"foo":"group2", "baz": "a", "bar": "b"}
{"foo":"group2", "baz": "b", "bar": "a"}
{"foo":"group2", "baz": "b", "bar": "b"}
{"foo":"group2", "baz": "a", "bar": "a"}
{"foo":"group2", "baz": "a", "bar": "c"}
{"foo":"group2", "baz": "b", "bar": "d"}
{"foo":"group2", "baz": "b", "bar": "e"}
{"foo":"group2", "baz": "a", "bar": "f"}
{"foo":"group3", "baz": "a", "bar": "a"}
{"foo":"group3", "bar": "b"}
{"foo":"group3", "baz": "b", "bar": "a"}
{"foo":"group3", "baz": "b", "bar": "b"}
{"foo":"group3", "baz": "a", "bar": "a"}
{"foo":"group3", "baz": "a" }
{"foo":"group3", "baz": "b", "bar": "d"}
{"foo":"group3", "baz": "b", "bar": "e"}
{"foo":"group3", "baz": "a", "bar": "f"}
```

### Count

Query:
```
find {foo: == "group1"}
return {baz: group(.baz), count: count()}
```
Results:

```json
{"baz":"a","count":4}
{"baz":"b","count":4}
{"baz":"c","count":4}

```

### Sum

Query:

```
find {foo: == "group1"}
return {baz: group(.baz), bar: sum(.bar)}
```

Results:

```json
{"baz":"a","bar":4}
{"baz":"b","bar":8}
{"baz":"c","bar":12}

```

### Avg

Query:

```
find {foo: == "group1"}
return {avg: avg(.bar)}
```

Results:

```json
{"avg":2}
```

### Concat

Query:

```
find {foo: =="group1"}
return {baz: group(.baz), concat: concat(.baz sep="|")}
```

Results:

```json
{"baz":"a","concat":"a|a|a|a"}
{"baz":"b","concat":"b|b|b|b"}
{"baz":"c","concat":"c|c|c|c"}
```

### Max

Query:

```
find {foo: =="group1"}
return {max: max(.bar)}
```
Results:

```json
{"max":3}
```

Query:

```
find {foo: =="group1"}
return {max: max(.baz)}
```

Results:

```json
{"max":"c"}
```

### Min

Query:

```
find {foo: =="group1"}
return {min: min(.bar)}
```

Results:

```json
{"min":1}
```

### Group Ordering

Query:

```
find {foo: =="group2"}
return [group(.baz order=asc), group(.bar order=desc), count()]
```

Results:

```json
["a","f",1]
["a","c",1]
["a","b",1]
["a","a",2]
["b","e",1]
["b","d",1]
["b","b",1]
["b","a",1]
```

### Default Values

Query:

```
find {foo: =="group2"}
return [group(.baz order=asc) default="a", group(.bar order=desc) default="c", count()];
```

Results:

```json
["a","f",1]
["a","c",1]
["a","b",1]
["a","a",2]
["b","e",1]
["b","d",1]
["b","b",1]
["b","a",1]
```

### Arrays

When performing aggregations on arrays, some functions will extract values out of the arrays (and arrays
nested in arrays). + +We have documents like this: + +```json +{"foo":"array1", "baz": ["a","b",["c","d",["e"]]]} +{"foo":"array1", "baz": ["f","g",["h","i"],"j"]} +{"foo":"array2", "baz": [1,2,[3,4,[5]]]} +{"foo":"array2", "baz": [6,7,[8,9],10]}; +``` + +Query: + +``` +find {foo: =="array1"} +return array(.baz) +``` + +Results: + +```json +[["f","g",["h","i"],"j"],["a","b",["c","d",["e"]]]] +``` + +Query: + +``` +find {foo: =="array1"} +return array_flat(.baz) +``` + +Results: + +```json +["f","g","h","i","j","a","b","c","d","e"] +``` + +Query: + +``` +find {foo: =="array1"} +return max(.baz) +``` + +Results: + +```json +["f","g",["h","i"],"j"] +``` + +Query: + +```json +find {foo: =="array1"} +return max_array(.baz) +``` + +Results: + +```json +"j" +``` + +Query: + +``` +find {foo: =="array1"} +return min_array(.baz) +``` + +Results: + +```json +"a" +``` + +Query: + +``` +find {foo: =="array2"} +return avg(.baz) +``` + +Results: + +```json +5.5 +``` + +Query: + +``` + +find {foo: =="array2"} +return sum(.baz) +``` + +Results: + +```json +55 +``` \ No newline at end of file From ac433a8671cc7f1feb2977818a6778014fdc6ec0 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Mon, 8 May 2017 00:00:52 -0700 Subject: [PATCH 114/122] Fix operator spacing and bad json code spec --- query_language_reference.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/query_language_reference.md b/query_language_reference.md index ce292de..7a75c4c 100644 --- a/query_language_reference.md +++ b/query_language_reference.md @@ -569,7 +569,7 @@ Results: Query: ``` -find {foo: =="group1"} +find {foo: == "group1"} return {baz: group(.baz), concat: concat(.baz sep="|")} ``` @@ -586,7 +586,7 @@ Results: Query: ``` -find {foo: =="group1"} +find {foo: == "group1"} return {max: max(.bar)} ``` Results: @@ -598,7 +598,7 @@ Results: Query: ``` -find {foo: =="group1"} +find {foo: == "group1"} return {max: max(.baz)} ``` @@ -613,7 +613,7 @@ Results: Query: ``` -find {foo: =="group1"} +find {foo: == "group1"} return {min: min(.bar)} ``` @@ -628,7 +628,7 @@ Results: Query: ``` -find {foo: =="group2"} +find {foo: == "group2"} return [group(.baz order=asc), group(.bar order=desc), count()] ``` @@ -683,7 +683,7 @@ We have documents like this: Query: ``` -find {foo: =="array1"} +find {foo: == "array1"} return array(.baz) ``` @@ -696,7 +696,7 @@ Results: Query: ``` -find {foo: =="array1"} +find {foo: == "array1"} return array_flat(.baz) ``` @@ -709,7 +709,7 @@ Results: Query: ``` -find {foo: =="array1"} +find {foo: == "array1"} return max(.baz) ``` @@ -721,8 +721,8 @@ Results: Query: -```json -find {foo: =="array1"} +``` +find {foo: == "array1"} return max_array(.baz) ``` @@ -735,7 +735,7 @@ Results: Query: ``` -find {foo: =="array1"} +find {foo: == "array1"} return min_array(.baz) ``` @@ -770,4 +770,4 @@ Results: ```json 55 -``` \ No newline at end of file +``` From b0ed72723de0a5339c948872e29428abb964c7f0 Mon Sep 17 00:00:00 2001 From: Damien Katz Date: Mon, 8 May 2017 12:32:09 -0700 Subject: [PATCH 115/122] Add link to query language reference --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1caf545..386c1f9 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ nature of JSON, and will allow: * Case sensitive exact word and sentence match * Arbitrary boolean nesting * Greater than/less Than matching + +[Query Langauge Reference here](https://github.com/pipedown/noise/blob/master/query_language_reference.md) Installation From 
1aed83039a5b866d2400b8e488807f41d50a4813 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Mon, 8 May 2017 12:38:19 -0700
Subject: [PATCH 116/122] Bump version number

---
 Cargo.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index d6894f7..dcd1992 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,9 +1,9 @@
 [package]
 name = "noise_search"
-version = "0.4.0"
+version = "0.4.1"
 authors = ["Damien Katz ", "Volker Mische "]
 repository = "https://github.com/pipedown/noise.git"
-homepage = "https://github.com/pipedown/noise.git"
+homepage = "http:/noisesearch.org"
 license = "MIT OR Apache-2.0"
 readme = "README.md"
 description = "Nested Object Inverted Search Engine"

From 3f43618e4004aca87efad5d097792be28953ca75 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Mon, 8 May 2017 20:29:20 -0700
Subject: [PATCH 117/122] Fix serious performance problem with count()

We forgot to do an early return for value fetch when count() is used. It
would instead load the whole document.

---
 src/returnable.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/returnable.rs b/src/returnable.rs
index 84e8db2..91bca15 100644
--- a/src/returnable.rs
+++ b/src/returnable.rs
@@ -299,6 +299,7 @@ impl Returnable for RetValue {
         if Some((AggregateFun::Count, JsonValue::Null)) == self.ag {
             //don't fetch anything for count(). just stick in a null
             result.push_back(JsonValue::Null);
+            return;
         }
         let mut kb = KeyBuilder::new();
         if let Some(json) = fetcher.fetch(seq, &mut kb, &self.rp) {

From aa2de27f32cae31ef2375204e6e6a4245ffd6ae3 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Mon, 8 May 2017 20:36:19 -0700
Subject: [PATCH 118/122] Bump version number for performance bug fix

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index dcd1992..41835e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "noise_search"
-version = "0.4.1"
+version = "0.4.2"
 authors = ["Damien Katz ", "Volker Mische "]
 repository = "https://github.com/pipedown/noise.git"
 homepage = "http:/noisesearch.org"

From 0fe09cb18109f28f37d8f8e3149623ea54689698 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Sat, 20 May 2017 21:37:35 -0700
Subject: [PATCH 119/122] Ensure successful cargo build before updating
 repl-tests

Otherwise you might run old code because you forgot to compile the debug
build.

---
 update-test-repl.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/update-test-repl.sh b/update-test-repl.sh
index e8e2937..31c2489 100755
--- a/update-test-repl.sh
+++ b/update-test-repl.sh
@@ -6,7 +6,11 @@ DIRNAME="$(dirname ${SCRIPTPATH})"
 NOISE="${DIRNAME}/target/debug/noise_search"
 REPL_TEST_DIR="${DIRNAME}/repl-tests"
-
+cargo build
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    exit $exit_status
+fi
 if [[ ! -f "${NOISE}" ]]; then
     echo "Can't find noise binary, looked at ${NOISE}"
     exit 1

From a30dbe48345a595db2d88b770480454ac4782c99 Mon Sep 17 00:00:00 2001
From: Damien Katz 
Date: Sat, 20 May 2017 21:42:19 -0700
Subject: [PATCH 120/122] change !== to != and bug fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While testing this syntax change, we found an existing bug with logical
not: it would match even when it shouldn't, because it would run off the
end of the array and assume it was a match.

Also, the semantics are now that if the field doesn't exist at all in a
document, the `!=` condition returns true.
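To make the new semantics concrete, a minimal repl-style sketch (the
documents here are illustrative, not taken from the test suite):

    add {"_id":"1", "foo": "bar"};
    add {"_id":"2", "foo": "bar", "fab": "baz"};

    # doc "1" has no `fab` field at all, so `fab: != "baz"` is true for it;
    # doc "2" has `fab` equal to "baz", so it is excluded.
    find {foo: == "bar", fab: != "baz"}
    return ._id ;
    [
    "1"
    ]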
---
 query_language_reference.md |  2 +-
 repl-tests/not.noise        | 56 ++++++++++++++++++++++
 src/filters.rs              | 88 ++++++++++++++++++++++++++++------
 src/parser.rs               | 96 ++++++++++++++++++++++++++-----------
 src/query.rs                | 10 ++++
 5 files changed, 207 insertions(+), 45 deletions(-)

diff --git a/query_language_reference.md b/query_language_reference.md
index 7a75c4c..95a7e16 100644
--- a/query_language_reference.md
+++ b/query_language_reference.md
@@ -125,7 +125,7 @@
 Use the `!` (logical NOT) to exclude matching criteria. Find docs where `foo` has value `"bar"` and `fab` does not have value `"baz"`:
 
 ```
-find {foo: == "bar", fab: !== "baz"}
+find {foo: == "bar", fab: != "baz"}
 ```
 
 You can use logical not with parentheses to negate everything enclosed. This example finds docs where `foo` has value `"bar"` and `fab` does not have value `"baz"` or `"biz"`':
diff --git a/repl-tests/not.noise b/repl-tests/not.noise
index ff439e7..c6c8e06 100644
--- a/repl-tests/not.noise
+++ b/repl-tests/not.noise
@@ -22,6 +22,12 @@ add {"_id":"8", "baz": ["quick","fox"]};
 "8"
 add {"_id":"9", "baz": ["quick","brown","fox"]};
 "9"
+add {"_id":"10", "baz": [["quick"],["brown"],["fox"]]};
+"10"
+add {"_id":"11", "baz": [["brown"],["fox"]]};
+"11"
+add {"_id":"12", "baz": [["fox"]]};
+"12"
 
 find {(bar: ~="fox" || bar: ~="brown") && (bar: !~="quick")}
 return ._id ;
@@ -81,6 +87,56 @@ return ._id ;
 "7"
 ]
 
+find {baz: [~="fox" || ~="brown"] && baz: [!~="fox"]}
+return ._id ;
+[
+"8",
+"9"
+]
+
+find {baz: [~="fox" || ~="brown"] && baz: [!="fox"]}
+return ._id ;
+[
+"8",
+"9"
+]
+
+# not on a field that doesn't exist.
+find {baz: [~="fox" || ~="brown"] && missing: ![~="fox"]}
+return ._id ;
+[
+"7",
+"8",
+"9"
+]
+
+find {baz: [[~="brown"]] || baz: [[!~="fox"]]}
+return ._id ;
+[
+"1",
+"2",
+"3",
+"4",
+"5",
+"6",
+"7",
+"8",
+"9",
+"10",
+"11"
+]
+
+find {baz: [[~="brown"]] && baz: [[!~="fox"]]}
+return ._id ;
+[
+"10",
+"11"
+]
+
+find {_id: == "12" && baz: [[!="fox"]]}
+return ._id ;
+[]
+
 # Test for unallowable expressions
 
 find !{baz: [~="fox"]}
diff --git a/src/filters.rs b/src/filters.rs
index c96605e..83f19d4 100644
--- a/src/filters.rs
+++ b/src/filters.rs
@@ -890,17 +890,67 @@ impl<'a> QueryRuntimeFilter for OrFilter<'a> {
 
 
 pub struct NotFilter<'a> {
+    iter: DBIterator,
     filter: Box<QueryRuntimeFilter + 'a>,
     last_doc_returned: Option<DocResult>,
-    array_depth: usize,
+    kb: KeyBuilder,
 }
 
 impl<'a> NotFilter<'a> {
-    pub fn new(filter: Box<QueryRuntimeFilter + 'a>, array_depth: usize) -> NotFilter {
+    pub fn new(snapshot: &Snapshot,
+               filter: Box<QueryRuntimeFilter + 'a>,
+               kb: KeyBuilder)
+               -> NotFilter<'a> {
         NotFilter {
+            iter: snapshot.new_iterator(),
             filter: filter,
             last_doc_returned: Some(DocResult::new()),
-            array_depth: array_depth,
+            kb: kb,
+        }
+    }
+
+    fn is_a_not_match(&mut self, dr: &DocResult) -> bool {
+        let ret = match dr.last_segment_array_index() {
+            Some(&0) => {
+                // if we got a (not) match on the first array element, it's always a match
+                // but only if the document actually exists.
+                true
+            }
+            Some(_) => {
+                // if we got a (not) match on any other element, check to make sure the key exists.
+                // if not, it means other elements did a regular match and skipped them, then we
+                // ran off the end of the array.
+                let value_key = self.kb.value_key_from_doc_result(&dr);
+                self.iter
+                    .set_mode(IteratorMode::From(value_key.as_bytes(),
+                                                 rocksdb::Direction::Forward));
+                if let Some((key, _value)) = self.iter.next() {
+                    let key_str = unsafe { str::from_utf8_unchecked(&key) };
+                    KeyBuilder::is_keypath_prefix(&value_key, &key_str)
+                } else {
+                    false
+                }
+            }
+            None => {
+                //not an array. always a (not) match.
+                true
+            }
+        };
+        if ret {
+            // make sure we actually have a document. It's possible we matched a non-existent seq.
+            let mut kb = KeyBuilder::new();
+            kb.push_object_key("_id");
+            let value_key = kb.value_key_from_doc_result(dr);
+            self.iter
+                .set_mode(IteratorMode::From(value_key.as_bytes(), rocksdb::Direction::Forward));
+            if let Some((key, _value)) = self.iter.next() {
+                let key_str = unsafe { str::from_utf8_unchecked(&key) };
+                value_key == key_str
+            } else {
+                false
+            }
+        } else {
+            false
+        }
     }
 }
@@ -908,26 +958,34 @@ impl<'a> NotFilter<'a> {
 impl<'a> QueryRuntimeFilter for NotFilter<'a> {
     fn first_result(&mut self, start: &DocResult) -> Option<DocResult> {
         let mut start = start.clone_only_seq_and_arraypath();
+        start.arraypath.resize(self.kb.arraypath_len(), 0);
         while let Some(dr) = self.filter.first_result(&start) {
-            if start.less(&dr, self.array_depth) {
-                self.last_doc_returned = Some(start.clone_only_seq_and_arraypath());
-                return Some(start.clone_only_seq_and_arraypath());
+            if start.less(&dr, self.kb.arraypath_len()) {
+                if self.is_a_not_match(&start) {
+                    self.last_doc_returned = Some(start.clone_only_seq_and_arraypath());
+                    return Some(start.clone_only_seq_and_arraypath());
+                } else {
+                    start.increment_first(self.kb.arraypath_len());
+                }
+            } else {
+                start.increment_last(self.kb.arraypath_len());
             }
-            start.increment_last(self.array_depth);
         }
         self.last_doc_returned = None;
-        Some(start)
+        if self.is_a_not_match(&start) {
+            Some(start)
+        } else {
+            None
+        }
     }
 
     fn next_result(&mut self) -> Option<DocResult> {
-        let next = if let Some(ref last_doc_returned) = self.last_doc_returned {
-            let mut next = last_doc_returned.clone_only_seq_and_arraypath();
-            next.increment_last(self.array_depth);
-            next
+        if let Some(mut next) = self.last_doc_returned.take() {
+            next.increment_last(self.kb.arraypath_len());
+            self.first_result(&next)
         } else {
-            return None;
-        };
-        self.first_result(&next)
+            None
+        }
     }
 
     fn prepare_relevancy_scoring(&mut self, _qsi: &mut QueryScoringInfo) {
diff --git a/src/parser.rs b/src/parser.rs
index 1096396..3f894e6 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -158,9 +158,12 @@ impl<'a, 'c> Parser<'a, 'c> {
         }
     }
 
-    fn consume_aggregate
-        (&mut self)
-         -> Result<Option<(AggregateFun, Option<String>, ReturnPath, JsonValue)>, Error> {
+    fn consume_aggregate(&mut self)
+                         -> Result<Option<(AggregateFun,
+                                           Option<String>, /*optional bind var name*/
+                                           ReturnPath,
+                                           JsonValue)>,
+                                   Error> {
         let offset = self.offset;
         let mut aggregate_fun = if self.consume("group") {
             AggregateFun::GroupAsc
@@ -564,7 +567,8 @@ impl<'a, 'c> Parser<'a, 'c> {
 
     fn not_object<'b>(&'b mut self) -> Result<Box<QueryRuntimeFilter + 'a>, Error> {
         if self.consume("!") {
-            Ok(Box::new(NotFilter::new(try!(self.object()), self.kb.arraypath_len())))
+            let filter = try!(self.object());
+            Ok(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone())))
         } else {
             self.object()
         }
@@ -597,7 +601,8 @@ impl<'a, 'c> Parser<'a, 'c> {
 
     fn parens<'b>(&'b mut self) -> Result<Box<QueryRuntimeFilter + 'a>, Error> {
         if self.consume("!") {
-            return Ok(Box::new(NotFilter::new(try!(self.parens()), self.kb.arraypath_len())));
+            let filter = try!(self.parens());
+            return Ok(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone())));
         }
         try!(self.must_consume("("));
         let filter = try!(self.object());
@@ -645,7 +650,7 @@ impl<'a, 'c> Parser<'a, 'c> {
         let offset = self.offset;
         if self.consume("!") {
             if let Some(f) = try!(self.oparens()) {
-                return Ok(Some(Box::new(NotFilter::new(f, self.kb.arraypath_len()))));
+                return Ok(Some(Box::new(NotFilter::new(&self.snapshot, f, self.kb.clone()))));
             } else {
                 self.offset = offset;
                 return Ok(None);
@@ -675,10 +680,28 @@ impl<'a, 'c> Parser<'a, 'c> {
     }
 
     fn compare<'b>(&'b mut self) -> Result<Box<QueryRuntimeFilter + 'a>, Error> {
-        if self.consume("!") {
-            return Ok(Box::new(NotFilter::new(try!(self.compare()), self.kb.arraypath_len())));
+        if let Some(filter) = try!(self.equal()) {
+            Ok(filter)
+        } else if let Some(filter) = try!(self.stemmed()) {
+            Ok(filter)
+        } else {
+            if self.consume(">") {
+                let min = try!(self.consume_range_operator());
+                let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), Some(min), None);
+                Ok(Box::new(filter))
+            } else if self.consume("<") {
+                let max = try!(self.consume_range_operator());
+                let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), None, Some(max));
+                Ok(Box::new(filter))
+            } else {
+                Err(Error::Parse("Expected comparison operator".to_string()))
+            }
         }
-        if self.consume("==") {
+    }
+
+    fn equal<'b>(&'b mut self) -> Result<Option<Box<QueryRuntimeFilter + 'a>>, Error> {
+        let not_equal = self.consume("!=");
+        if not_equal || self.consume("==") {
             let json = try!(self.must_consume_json_primitive());
             let boost = try!(self.consume_boost());
             let filter: Box<QueryRuntimeFilter + 'a> = match json {
@@ -726,21 +749,32 @@ impl<'a, 'c> Parser<'a, 'c> {
             }
             _ => panic!("Exact match on other JSON types is not yet implemented!"),
         };
-        Ok(filter)
-    } else if self.consume("~=") {
+        if not_equal {
+            Ok(Some(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone()))))
+        } else {
+            Ok(Some(filter))
+        }
+    } else {
+        Ok(None)
+    }
+}
+
+    fn stemmed<'b>(&'b mut self) -> Result<Option<Box<QueryRuntimeFilter + 'a>>, Error> {
+        let not_stemmed = self.consume("!~=");
+        if not_stemmed || self.consume("~=") {
         // regular search
         let literal = try!(self.must_consume_string_literal());
         let boost = try!(self.consume_boost());
         let stems = Stems::new(&literal);
         let stemmed_words: Vec<String> = stems.map(|stem| stem.stemmed).collect();
-        match stemmed_words.len() {
+        let filter: Box<QueryRuntimeFilter + 'a> = match stemmed_words.len() {
             0 => panic!("Cannot create a StemmedWordFilter"),
             1 => {
-                Ok(Box::new(StemmedWordFilter::new(&self.snapshot,
-                                                   &stemmed_words[0],
-                                                   &self.kb,
-                                                   boost)))
+                Box::new(StemmedWordFilter::new(&self.snapshot,
+                                                &stemmed_words[0],
+                                                &self.kb,
+                                                boost))
             }
             _ => {
                 let mut filters: Vec<StemmedWordPosFilter> = Vec::new();
@@ -751,10 +785,15 @@ impl<'a, 'c> Parser<'a, 'c> {
                                                           boost);
                     filters.push(filter);
                 }
-                Ok(Box::new(StemmedPhraseFilter::new(filters)))
+                Box::new(StemmedPhraseFilter::new(filters))
             }
+        };
+        if not_stemmed {
+            Ok(Some(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone()))))
+        } else {
+            Ok(Some(filter))
         }
-    } else if self.consume("~") {
+    } else if not_stemmed || self.consume("~") {
         let word_distance = match try!(self.consume_integer()) {
             Some(int) => int,
             None => {
@@ -777,18 +816,17 @@ impl<'a, 'c> Parser<'a, 'c> {
         }
         match filters.len() {
             0 => panic!("Cannot create a DistanceFilter"),
-            _ => Ok(Box::new(DistanceFilter::new(filters, word_distance as u32))),
+            _ => {
+                let filter = Box::new(DistanceFilter::new(filters, word_distance as u32));
+                if not_stemmed {
+                    Ok(Some(Box::new(NotFilter::new(&self.snapshot, filter, self.kb.clone()))))
+                } else {
+                    Ok(Some(filter))
+                }
+            }
         }
-    } else if self.consume(">") {
-        let min = try!(self.consume_range_operator());
-        let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), Some(min), None);
-        Ok(Box::new(filter))
-    } else if self.consume("<") {
-        let max = try!(self.consume_range_operator());
-        let filter = RangeFilter::new(&self.snapshot, self.kb.clone(), None, Some(max));
-        Ok(Box::new(filter))
     } else {
-        Err(Error::Parse("Expected comparison operator".to_string()))
+        Ok(None)
     }
 }
@@ -820,7 +858,7 @@ impl<'a, 'c> Parser<'a, 'c> {
         let offset = self.offset;
         if self.consume("!") {
             if let Some(f) = try!(self.aparens()) {
-                return Ok(Some(Box::new(NotFilter::new(f, self.kb.arraypath_len()))));
+                return Ok(Some(Box::new(NotFilter::new(&self.snapshot, f, self.kb.clone()))));
             } else {
                 self.offset = offset;
                 return Ok(None);
diff --git a/src/query.rs b/src/query.rs
index 8f27803..95fcc10 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -138,6 +138,16 @@ impl DocResult {
             }
         }
     }
+
+    pub fn last_segment_array_index(&self) -> Option<&u64> {
+        self.arraypath.last()
+    }
+
+    pub fn increment_first(&mut self, array_depth: usize) {
+        self.seq += 1;
+        self.arraypath.clear();
+        self.arraypath.resize(array_depth, 0);
+    }
 }
 
 impl PartialEq for DocResult {

From 94bb988dd12698fbc7df2ea2f2a5397c4503f684 Mon Sep 17 00:00:00 2001
From: Damien Katz
Date: Wed, 24 May 2017 11:44:07 -0700
Subject: [PATCH 121/122] bump version to 0.5.0 for logical not operator fix

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 41835e1..48b6003 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "noise_search"
-version = "0.4.2"
+version = "0.5.0"
 authors = ["Damien Katz ", "Volker Mische "]
 repository = "https://github.com/pipedown/noise.git"
 homepage = "http://noisesearch.org"

From f198786b15e80dbd2de7b0139aa9cdaac2c0ecea Mon Sep 17 00:00:00 2001
From: Volker Mische
Date: Thu, 25 May 2017 11:13:19 +0200
Subject: [PATCH 122/122] Add workaround to "only logical nots" limitation

You cannot have every clause be negated, as it's a very resource
intensive operation. Queries need at least one non-negated clause.

There are two workarounds:

 - Do a `find {}` and filter out the results in your application
 - Add a field that will always match and use it in your condition.
   Example:

       find {alwaystrue: == true && foo: !~= "bar"}

Add this information to the language reference and the repl-tests.

Fixes #28.
---
 query_language_reference.md | 10 +++++++++-
 repl-tests/not.noise        | 40 ++++++++++++++++++++++++++-----------
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/query_language_reference.md b/query_language_reference.md
index 95a7e16..e264efe 100644
--- a/query_language_reference.md
+++ b/query_language_reference.md
@@ -134,7 +134,7 @@ You can use logical not with parentheses to negate everything enclosed. This exa
 find {foo: == "bar", !(fab: == "baz" || fab: == "biz")}
 ```
 
-You cannot have every clause be negated. Query need at least one non-negated clauses.
+You cannot have every clause be negated, as it's a very resource intensive operation. Queries need at least one non-negated clause.
 
 Illegal:
@@ -156,6 +156,14 @@ Illegal:
 find {foo ~= "waz" && !(foo: ~= "bar" && foo: !~= "baz"})
 ```
 
+Workarounds for this limitation are:
+
+ - Do a `find {}` and filter out the results in your application
+ - Add a field that will always match and use it in your condition. Example:
+   ```
+   find {alwaystrue: == true && foo: !~= "bar"}
+   ```
+
 ### Relevancy Scoring and Boosting
 
 Relevancy scoring uses a combination boolean model and Term Frequency/Inverse Document Frequency (TF/IDF) scoring system, very similar to Lucene and Elastic Search. The details of the scoring model is beyond the scope of the document.
diff --git a/repl-tests/not.noise b/repl-tests/not.noise
index c6c8e06..3569a82 100644
--- a/repl-tests/not.noise
+++ b/repl-tests/not.noise
@@ -4,29 +4,29 @@
 drop target/tests/querytestnot;
 create target/tests/querytestnot;
 
-add {"_id":"1", "bar": "fox"};
+add {"_id":"1", "alwaystrue": true, "bar": "fox"};
 "1"
-add {"_id":"2", "bar": "quick fox"};
+add {"_id":"2", "alwaystrue": true, "bar": "quick fox"};
 "2"
-add {"_id":"3", "bar": "quick brown fox"};
+add {"_id":"3", "alwaystrue": true, "bar": "quick brown fox"};
 "3"
-add {"_id":"4", "bar": ["fox"]};
+add {"_id":"4", "alwaystrue": true, "bar": ["fox"]};
 "4"
-add {"_id":"5", "bar": ["quick fox"]};
+add {"_id":"5", "alwaystrue": true, "bar": ["quick fox"]};
 "5"
-add {"_id":"6", "bar": ["quick brown fox"]};
+add {"_id":"6", "alwaystrue": true, "bar": ["quick brown fox"]};
 "6"
-add {"_id":"7", "baz": ["fox"]};
+add {"_id":"7", "alwaystrue": true, "baz": ["fox"]};
 "7"
-add {"_id":"8", "baz": ["quick","fox"]};
+add {"_id":"8", "alwaystrue": true, "baz": ["quick","fox"]};
 "8"
-add {"_id":"9", "baz": ["quick","brown","fox"]};
+add {"_id":"9", "alwaystrue": true, "baz": ["quick","brown","fox"]};
 "9"
-add {"_id":"10", "baz": [["quick"],["brown"],["fox"]]};
+add {"_id":"10", "alwaystrue": true, "baz": [["quick"],["brown"],["fox"]]};
 "10"
-add {"_id":"11", "baz": [["brown"],["fox"]]};
+add {"_id":"11", "alwaystrue": true, "baz": [["brown"],["fox"]]};
 "11"
-add {"_id":"12", "baz": [["fox"]]};
+add {"_id":"12", "alwaystrue": true, "baz": [["fox"]]};
 "12"
 
 find {(bar: ~="fox" || bar: ~="brown") && (bar: !~="quick")}
@@ -143,6 +143,10 @@ find !{baz: [~="fox"]}
 return ._id ;
 Parse error: query cannot be made up of only logical not. Must have at least one match clause not negated.
 
+find {!(bar: ~="quick" || bar: [~="quick"] || baz: [~="quick"] || baz: [[~="quick"]])}
+return ._id ;
+Parse error: query cannot be made up of only logical not. Must have at least one match clause not negated.
+
 find !{baz: ~="fox"} && !{baz: =="foo"}
 return ._id ;
 Parse error: Logical not ("!") is nested inside of another logical not. This is not allowed.
@@ -150,3 +154,15 @@ Parse error: Logical not ("!") is nested inside of another logical not. This is
 find {foo: =="bar"} && !{baz: !~="fox"}
 return ._id ;
 Parse error: Logical not ("!") is nested inside of another logical not. This is not allowed.
+
+# Workaround for unallowable expressions
+
+find {!(bar: ~="quick" || bar: [~="quick"] || baz: [~="quick"] || baz: [[~="quick"]]), alwaystrue: == true}
+return ._id ;
+[
+"1",
+"4",
+"7",
+"11",
+"12"
+]
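The `alwaystrue` workaround is exercised by the repl-test above. The other documented workaround, doing a `find {}` and filtering the results in the application, can look roughly like the sketch below. It uses serde_json purely for illustration (any JSON handling works), and the exact output shape of the query is an assumption here, not something this patch series specifies:

```rust
extern crate serde_json;

use serde_json::Value;

fn main() {
    // Stand-in for what a `find {}` query returning whole documents
    // might hand back: a JSON array with every document in the index.
    let results = r#"[{"_id":"1","fab":"baz"},{"_id":"2","fab":"biz"},{"_id":"3"}]"#;
    let docs: Vec<Value> = serde_json::from_str(results).unwrap();

    // Application-side equivalent of the disallowed all-negated query
    // `find {fab: != "baz"}`: keep documents where `fab` is absent or
    // has some other value, mirroring the new != semantics.
    let kept: Vec<&Value> = docs.iter()
        .filter(|doc| doc.get("fab").and_then(Value::as_str) != Some("baz"))
        .collect();

    for doc in &kept {
        println!("{}", doc["_id"]); // prints "2" then "3"
    }
}
```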