Update for tantivy 0.2.0
fulmicoton committed Dec 11, 2016
2 parents ab0c8ed + 3510691 commit 2208b65
Showing 11 changed files with 481 additions and 305 deletions.
349 changes: 207 additions & 142 deletions Cargo.lock

Large diffs are not rendered by default.

44 changes: 25 additions & 19 deletions Cargo.toml
@@ -1,37 +1,43 @@
[package]
name = "tantivy-cli"
version = "0.1.1"
version = "0.2.0"
authors = ["Paul Masurel <[email protected]>"]

description = """Command line interface for Tantivy, a search engine library."""
documentation = "https://github.com/fulmicoton/tantivy"
homepage = "https://github.com/fulmicoton/tantivy"
repository = "https://github.com/fulmicoton/tantivy"
documentation = "https://github.com/tantivy-search/tantivy-cli"
homepage = "https://github.com/tantivy-search/tantivy-cli"
repository = "https://github.com/tantivy-search/tantivy-cli"

readme = "README.md"
keywords = ["search", "information", "retrieval"]
license = "MIT"

[dependencies]
#tantivy = { path = "../tantivy" }
tantivy = "0.1.1"
time = "0.1.34"

time = "0.1"
iron = "0.4"
staticfile = "0.3.0"
rustc-serialize = "0.3.16"
persistent="0.2.0"
staticfile = "0.3"
rustc-serialize = "0.3"
persistent="0.2"
clap = "2"
ansi_term = "0.8.0"
ansi_term = "0.8"
urlencoded = "0.4"
mount = "0.2.1"
mount = "0.2"
chan = "0.1"
bincode = "0.4"
byteorder = "0.5"
log = "0.3"
env_logger = "0.3"
tantivy = "0.2.0"

[[bin]]
name = "tantivy"
path = "src/main.rs"


# [dependencies.clap]
#version = "2"
#default-features = false
#features = [ "suggestions", "color" ]
[profile.release]
opt-level = 3
debug = false
debug-assertions = false
lto = true

[[bin]]
name = "tantivy"
path = "src/main.rs"
58 changes: 33 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
@@ -142,7 +142,6 @@ It contains two sections:
- segments (currently empty, but we will change that soon)
- schema




# Indexing the document: `index`
@@ -163,34 +162,39 @@ Make sure to decompress the file
bunzip2 wiki-articles.json.bz2
```

If you are in a rush you can [download 100 articles in the right format here](http://fulmicoton.com/tantivy-files/wiki-articles-1000.json).
If you are in a rush you can [download 100 articles in the right format here (11 MB)](http://fulmicoton.com/tantivy-files/wiki-articles-1000.json).

The `index` command will index your document.
By default it will use as many threads as there are cores on your machine.
You can change the number of threads by passing it the `-t` parameter.
By default, it will use 3 threads, with a total buffer of 1GB split
across these threads.

On my computer (8 core Xeon(R) CPU X3450 @ 2.67GHz), it will take around 6 minutes.

```
cat wiki-articles.json | tantivy index -i ./wikipedia-index
```

While it is indexing, you can peek at the index directory
to check what is happening.
You can change the number of threads by passing the `-t` parameter, and the total
buffer size used by the indexing threads' heaps by using `-m`. Note that tantivy's memory usage
is greater than just this buffer size parameter.
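For example, a run that pins the indexer to four threads with a 2GB total buffer might look like this (assuming `-m` takes the total buffer size in bytes; check the command's help output for the exact unit):

```
cat wiki-articles.json | tantivy index -i ./wikipedia-index -t 4 -m 2000000000
```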

On my computer (8 core Xeon(R) CPU X3450 @ 2.67GHz), on 8 threads, indexing Wikipedia takes around 9 minutes.


While tantivy is indexing, you can peek at the index directory to check what is happening.

```bash
ls ./wikipedia-index
```

If you indexed the 5 million articles, you should see a lot of new files.

The main file is `meta.json`.

You should also see a lot of files with a UUID as their filename, each with one of several different extensions.
Our index is in fact divided into segments. Each segment acts as an individual smaller index;
its name is simply a UUID.



If you decided to index the complete Wikipedia dump, you may also see some of these files disappear.
Having too many segments can hurt search performance, so tantivy automatically starts
merging segments.

# Serve the search index: `serve`

@@ -204,26 +208,30 @@ You can run it with the following command.
By default, it will serve on port `3000`.
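For instance, pointing the server at the index built above would presumably look like this (the `-i` flag is assumed here to match the other subcommands in this README):

```
tantivy serve -i ./wikipedia-index
```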

You can search for the top 20 most relevant documents for the query `Barack Obama` by accessing
the following [url](http://localhost:3000/api/?q=barack+obama&explain=true&nhits=20) in your browser
the following [url](http://localhost:3000/api/?q=barack+obama&nhits=20) in your browser

http://localhost:3000/api/?q=barack+obama&nhits=20

http://localhost:3000/api/?q=barack+obama&explain=true&nhits=20
By default, this query is treated as `barack OR obama`.
You can also search for documents that contain both terms by adding a `+` sign before the terms in your query.

http://localhost:3000/api/?q=%2Bbarack%20%2Bobama%0A&nhits=20

Also, `-` makes it possible to exclude documents containing a specific term.

# Optimizing the index: `merge`
http://localhost:3000/api/?q=-barack%20%2Bobama%0A&nhits=20

Finally, tantivy handles phrase queries.

Each of tantivy's indexer threads is building its own independent segment.
When its buffer is full, it closes its running segment, and starts working on a new one.
You should currently have more than 50 segments in your directory.
http://localhost:3000/api/?q=%22barack%20obama%22&nhits=20
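From the shell, the same API can be exercised with curl; the URL-encoded query above decodes to the phrase `"barack obama"`:

```
curl "http://localhost:3000/api/?q=%22barack%20obama%22&nhits=20"
```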


Having that many segments can hurt your query performance.
Calling `tantivy merge` will merge your segments into one.
# Search the index via the command line

You may also use the `search` command to stream all documents matching a specific query.
The documents are returned in an unspecified order.

```
tantivy merge -i ./wikipedia-index
tantivy search -i wikipedia-index -q "barack obama"
```

(The command takes less than 4 minutes on my computer)

Note that your files are still there even after running the command.
However, `meta.json` only lists one of the segments.
You will still need to remove the files manually.
13 changes: 6 additions & 7 deletions src/commands/bench.rs
@@ -1,7 +1,6 @@
use tantivy::Index;
use tantivy::schema::{Field, Schema};
use tantivy::query::QueryParser;
use tantivy::query::Query;
use std::path::Path;
use tantivy::TimerTree;
use std::io::BufReader;
@@ -57,7 +56,7 @@ fn run_bench(index_path: &Path,
println!("-------------------------------\n\n\n");

let index = try!(Index::open(index_path).map_err(|e| format!("Failed to open index.\n{:?}", e)));
let searcher = try!(index.searcher().map_err(|e| format!("Failed to acquire searcher.\n{:?}", e)));
let searcher = index.searcher();
let default_search_fields: Vec<Field> = extract_search_fields(&index.schema());
let queries = try!(read_query_file(query_filepath).map_err(|e| format!("Failed reading the query file: {}", e)));
let query_parser = QueryParser::new(index.schema(), default_search_fields);
@@ -67,15 +66,15 @@ fn run_bench(index_path: &Path,
for _ in 0..num_repeat {
for query_txt in &queries {
let query = query_parser.parse_query(&query_txt).unwrap();
let num_terms = query.num_terms();
// let num_terms = query.num_terms();
let mut top_collector = TopCollector::with_limit(10);
let mut count_collector = CountCollector::new();
let mut count_collector = CountCollector::default();
let timing;
{
let mut collector = chain().add(&mut top_collector).add(&mut count_collector);
let mut collector = chain().push(&mut top_collector).push(&mut count_collector);
timing = try!(query.search(&searcher, &mut collector).map_err(|e| format!("Failed while searching query {:?}.\n\n{:?}", query_txt, e)));
}
println!("{}\t{}\t{}\t{}", query_txt, num_terms, count_collector.count(), timing.total_time());
println!("{}\t{}\t{}", query_txt, count_collector.count(), timing.total_time());
}
}

@@ -87,7 +86,7 @@ fn run_bench(index_path: &Path,
let query = query_parser.parse_query(&query_txt).unwrap();
let mut top_collector = TopCollector::with_limit(10);
try!(query.search(&searcher, &mut top_collector).map_err(|e| format!("Failed while retrieving document for query {:?}.\n{:?}", query, e)));
let mut timer = TimerTree::new();
let mut timer = TimerTree::default();
{
let _scoped_timer_ = timer.open("total");
for doc_address in top_collector.docs() {
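For context, this diff migrates bench.rs to tantivy 0.2's collector chaining, where `chain()` exposes `push` instead of `add` and `CountCollector` is built via `Default`. Below is a minimal sketch of the pattern, assuming the `tantivy::collector` import paths and the `Searcher`/`Query` exports of tantivy 0.2 (the actual imports sit in a collapsed part of this diff):

```rust
extern crate tantivy;

use tantivy::collector::{chain, CountCollector, TopCollector};
use tantivy::query::Query;
use tantivy::Searcher;

// Runs a query with two collectors chained together, as the updated
// bench.rs does: one keeps the top 10 documents, the other counts hits.
fn search_with_chain(searcher: &Searcher, query: &Query) -> tantivy::Result<()> {
    let mut top_collector = TopCollector::with_limit(10);
    let mut count_collector = CountCollector::default();
    {
        // The chained collector mutably borrows both collectors, hence
        // the inner scope so they can be read from afterwards.
        let mut collector = chain().push(&mut top_collector).push(&mut count_collector);
        try!(query.search(searcher, &mut collector));
    }
    println!("{} hits", count_collector.count());
    Ok(())
}
```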
116 changes: 76 additions & 40 deletions src/commands/index.rs
@@ -1,15 +1,19 @@
use std::convert::From;
use std::fs::File;
use std::io;
use std::cmp;
use std::io::BufRead;
use std::io::BufReader;
use std::io::Read;
use std::path::PathBuf;
use tantivy;
use tantivy;
use tantivy::Index;
use tantivy::IndexWriter;
use tantivy::Document;
use time::PreciseTime;
use clap::ArgMatches;

use chan;
use std::thread;

pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> {
let index_directory = PathBuf::from(argmatch.value_of("index").unwrap());
@@ -21,81 +25,113 @@ pub fn run_index_cli(argmatch: &ArgMatches) -> Result<(), String> {
None => DocumentSource::FromPipe,
}
};
let num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer.")));
run_index(index_directory, document_source, num_threads).map_err(|e| format!("Indexing failed : {:?}", e))
}

enum DocumentSource {
FromPipe,
FromFile(PathBuf),
let mut num_threads = try!(value_t!(argmatch, "num_threads", usize).map_err(|_|format!("Failed to read num_threads argument as an integer.")));
if num_threads == 0 {
num_threads = 1;
}
let buffer_size = try!(value_t!(argmatch, "memory_size", usize).map_err(|_|format!("Failed to read the buffer size argument as an integer.")));
let buffer_size_per_thread = buffer_size / num_threads;
run_index(index_directory, document_source, buffer_size_per_thread, num_threads).map_err(|e| format!("Indexing failed : {:?}", e))
}

fn run_index(directory: PathBuf, document_source: DocumentSource, num_threads: usize) -> tantivy::Result<()> {
fn run_index(directory: PathBuf, document_source: DocumentSource, buffer_size_per_thread: usize, num_threads: usize) -> tantivy::Result<()> {

let index = try!(Index::open(&directory));

let schema = index.schema();
let (line_sender, line_receiver) = chan::sync(10_000);
let (doc_sender, doc_receiver) = chan::sync(10_000);

thread::spawn(move || {
let articles = document_source.read().unwrap();
for article_line_res in articles.lines() {
let article_line = article_line_res.unwrap();
line_sender.send(article_line);
}
});

let mut index_writer = try!(

let num_threads_to_parse_json = cmp::max(1, num_threads / 2);
info!("Using {} threads to parse json", num_threads_to_parse_json);
for _ in 0..num_threads_to_parse_json {
let schema_clone = schema.clone();
let doc_sender_clone = doc_sender.clone();
let line_receiver_clone = line_receiver.clone();
thread::spawn(move || {
for article_line in line_receiver_clone {
match schema_clone.parse_document(&article_line) {
Ok(doc) => {
doc_sender_clone.send(doc);
}
Err(err) => {
println!("Failed to add document doc {:?}", err);
}
}
}
});
}
drop(doc_sender);

let mut index_writer = try!(
if num_threads > 0 {
index.writer_with_num_threads(num_threads)
index.writer_with_num_threads(num_threads, buffer_size_per_thread)
}
else {
index.writer()
index.writer(buffer_size_per_thread)
}
);


let index_result = index_documents(&mut index_writer, doc_receiver);
try!(match index_result {
Ok(docstamp) => {
println!("Commit succeed, docstamp at {}", docstamp);
Ok(())
}
Err(e) => {
println!("Error during indexing, rollbacking.");
index_writer.rollback().unwrap();
println!("Rollback succeeded");
Err(e)
}
});

let articles = try!(document_source.read());

index_writer.wait_merging_threads()
}

fn index_documents(index_writer: &mut IndexWriter, doc_receiver: chan::Receiver<Document>) -> tantivy::Result<u64> {
let group_count = 100_000;
let mut num_docs = 0;
let mut cur = PreciseTime::now();
let group_count = 100000;

for article_line_res in articles.lines() {
let article_line = article_line_res.unwrap(); // TODO
match schema.parse_document(&article_line) {
Ok(doc) => {
index_writer.add_document(doc).unwrap();
}
Err(err) => {
println!("Failed to add document doc {:?}", err);
}
}
for doc in doc_receiver {
try!(index_writer.add_document(doc));
if num_docs > 0 && (num_docs % group_count == 0) {
println!("{} Docs", num_docs);
let new = PreciseTime::now();
let elapsed = cur.to(new);
println!("{:?} docs / hour", group_count * 3600 * 1_000_000 as u64 / (elapsed.num_microseconds().unwrap() as u64));
cur = new;
}

num_docs += 1;

}
index_writer.wait().unwrap(); // TODO
Ok(())
index_writer.commit()
}


#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct WikiArticle {
pub url: String,
pub title: String,
pub body: String,
enum DocumentSource {
FromPipe,
FromFile(PathBuf),
}


impl DocumentSource {
fn read(&self,) -> io::Result<BufReader<Box<Read>>> {
Ok(match self {
&DocumentSource::FromPipe => {
BufReader::new(Box::new(io::stdin()))
}
}
&DocumentSource::FromFile(ref filepath) => {
let read_file = try!(File::open(&filepath));
BufReader::new(Box::new(read_file))
}
})
}
}
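The new indexing pipeline above fans article lines out to several JSON-parsing threads over bounded channels. Here is a minimal standalone sketch of that fan-out pattern with the `chan` crate used in this commit (the per-line work is hypothetical; the real code parses each line into a tantivy `Document` and forwards it to the index writer):

```rust
extern crate chan;

use std::thread;

fn main() {
    // Bounded channel, as in the indexer's `chan::sync(10_000)`.
    let (line_sender, line_receiver) = chan::sync(4);

    // One producer thread feeding lines into the channel.
    thread::spawn(move || {
        for i in 0..20 {
            line_sender.send(format!("line {}", i));
        }
        // `line_sender` is dropped when this thread ends, which closes
        // the channel and lets the worker loops below terminate.
    });

    // Several workers draining the same receiver; each message is
    // delivered to exactly one worker.
    let workers: Vec<_> = (0..3).map(|worker_id| {
        let receiver = line_receiver.clone();
        thread::spawn(move || {
            for line in receiver {
                // Hypothetical per-line work.
                println!("worker {} got {:?}", worker_id, line);
            }
        })
    }).collect();

    for worker in workers {
        worker.join().unwrap();
    }
}
```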

