diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bb8dda6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.gem
+.bundle
+.yardoc
+Gemfile.lock
+doc/*
+pkg/*
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..11a115a
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,3 @@
+language: ruby
+rvm:
+  - 1.9.3
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..22019ab
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,4 @@
+source "https://rubygems.org"
+
+# Runtime and development dependencies are declared in the gemspec.
+gemspec
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..edb6c89
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2012 Open North Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..561afdf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# Ruby Vector Space Model (VSM) with tf*idf weights
+
+For performance, use [Lucene](http://lucene.apache.org/core/), which also implements other information retrieval functions, such as [BM25](http://en.wikipedia.org/wiki/Okapi_BM25).
+
+## Usage
+
+    require 'tf-idf-similarity'
+
+
+## Extras
+
+You can access more term frequency, document frequency, and normalization formulas with:
+
+    require 'tf-idf-similarity/extras/collection'
+    require 'tf-idf-similarity/extras/document'
+
+The default tf*idf formula follows [Lucene](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
+
+## Papers
+
+* [G. Salton and C. Buckley. "Term Weighting Approaches in Automatic Text Retrieval." Technical Report. Cornell University, Ithaca, NY, USA. 1987.](http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf)
+* [E. Chisholm and T. G. Kolda. "New term weighting formulas for the vector space method in information retrieval." Technical Report Number ORNL-TM-13756. Oak Ridge National Laboratory, Oak Ridge, TN, USA. 1999.](http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf)
+
+## Bugs? Questions?
+
+This gem's main repository is on GitHub: [http://github.com/opennorth/tf-idf-similarity](http://github.com/opennorth/tf-idf-similarity), where your contributions, forks, bug reports, feature requests, and feedback are greatly welcomed.
+
+Copyright (c) 2012 Open North Inc., released under the MIT license
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..dec9518
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,16 @@
+require 'bundler'
+Bundler::GemHelper.install_tasks # adds Bundler's gem packaging tasks
+
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) # defines the spec task
+
+task :default => :spec # a bare `rake` runs the specs
+
+begin
+  require 'yard'
+  YARD::Rake::YardocTask.new
+rescue LoadError # YARD is optional: define a stub task that says how to install it
+  task :yard do
+    abort 'YARD is not available. In order to run yard, you must: gem install yard'
+  end
+end
diff --git a/USAGE b/USAGE
new file mode 100644
index 0000000..e0be16c
--- /dev/null
+++ b/USAGE
@@ -0,0 +1 @@
+See README.md for full usage details.
diff --git a/lib/tf-idf-similarity.rb b/lib/tf-idf-similarity.rb
new file mode 100644
index 0000000..9db342a
--- /dev/null
+++ b/lib/tf-idf-similarity.rb
@@ -0,0 +1,7 @@
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
+
+module TfIdfSimilarity
+  autoload :Collection, 'tf-idf-similarity/collection'
+  autoload :Document, 'tf-idf-similarity/document'
+  autoload :Token, 'tf-idf-similarity/token'
+end
diff --git a/lib/tf-idf-similarity/collection.rb b/lib/tf-idf-similarity/collection.rb
new file mode 100644
index 0000000..e707d5f
--- /dev/null
+++ b/lib/tf-idf-similarity/collection.rb
@@ -0,0 +1,66 @@
+require 'matrix'
+
+class TfIdfSimilarity::Collection
+  # The documents in the collection.
+  attr_reader :documents
+  # The number of times each term appears in all documents.
+  attr_reader :term_counts
+  # The number of documents each term appears in.
+  attr_reader :document_counts
+
+  def initialize
+    @documents       = []
+    @term_counts     = Hash.new 0
+    @document_counts = Hash.new 0
+  end
+
+  def <<(document)
+    document.term_counts.each do |term,count|
+      @term_counts[term]     += count
+      @document_counts[term] += 1
+    end
+    @documents << document
+  end
+
+  # @return [Array<String>] the set of the collection's terms with no duplicates
+  def terms
+    term_counts.keys
+  end
+
+  # @note Use GSL or Linalg, or a package that implements sparse matrices, if
+  #   Ruby's Matrix performance is too slow.
+  #
+  # @see http://en.wikipedia.org/wiki/Vector_space_model
+  # @see http://en.wikipedia.org/wiki/Document-term_matrix
+  # @see http://en.wikipedia.org/wiki/Cosine_similarity
+  def similarity_matrix
+    idf = []
+
+    term_document_matrix = Matrix.build(terms.size, documents.size) do |i,j|
+      idf[i] ||= inverse_document_frequency terms[i]
+      documents[j].term_frequency(terms[i]) * idf[i]
+    end
+
+    # Columns are normalized to unit vectors, so we can calculate the cosine
+    # similarity of all document vectors.
+    matrix = normalize term_document_matrix
+    matrix.transpose * matrix
+  end
+
+  # @param [String] term a term
+  # @return [Float] the term's inverse document frequency
+  #
+  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+  def inverse_document_frequency(term)
+    1 + Math.log2 documents.size / (document_counts(term).to_f + 1)
+  end
+  alias_method :idf, :inverse_document_frequency
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] a matrix in which all document vectors are unit vectors
+  #
+  # @note Lucene normalizes document length differently.
+  def normalize(matrix)
+    Matrix.columns tfidf.column_vectors.map(&:normalize)
+  end
+end
diff --git a/lib/tf-idf-similarity/document.rb b/lib/tf-idf-similarity/document.rb
new file mode 100644
index 0000000..469fed8
--- /dev/null
+++ b/lib/tf-idf-similarity/document.rb
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+class TfIdfSimilarity::Document
+  # An optional document identifier.
+  attr_reader :id
+  # The document's text.
+  attr_reader :text
+  # The number of times each term appears in the document.
+  attr_reader :term_counts
+  # The maximum term count of any term in the document.
+  attr_reader :maximum_term_count
+  # The average term count of all terms in the document.
+  attr_reader :average_term_count
+
+  # @param [String] text the document's text
+  # @param [Hash] opts optional arguments
+  # @option opts [String] :id a string to identify the document
+  def initialize(text, opts = {})
+    @text        = text
+    @id          = opts[:id] || object_id
+    @term_counts = Hash.new 0
+    process
+  end
+
+  # @return [Array<String>] the set of the document's terms with no duplicates
+  def terms
+    term_counts.keys
+  end
+  
+  # @param [String] term a term
+  # @return [Integer] the number of times the term appears in the document
+  def term_count(term)
+    term_counts[term]
+  end
+
+  # @param [String] term a term
+  # @return [Float] the square root of the term count
+  #
+  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+  def term_frequency(term)
+    Math.sqrt term_count(term)
+  end
+  alias_method :tf, :term_frequency
+
+private
+
+  # Tokenize the text and counts terms.
+  def process
+    tokenize(text).each do |word|
+      token = Token.new word
+      if token.valid?
+        @term_counts[token.lowercase_filter.classic_filter.to_s] += 1
+      end
+    end
+
+    @maximum_term_count = @term_counts.values.max.to_f
+    @average_term_count = @term_counts.values.reduce(:+) / @term_counts.size.to_f
+  end
+
+  # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
+  # Word Boundary Specification.
+  #
+  # @param [String] text a text
+  # @return [Enumerator] a token enumerator
+  #
+  # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
+  #   or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
+  #
+  # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
+  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
+  def tokenize(text)
+    UnicodeUtils.each_word text
+  end
+end
diff --git a/lib/tf-idf-similarity/extras/collection.rb b/lib/tf-idf-similarity/extras/collection.rb
new file mode 100644
index 0000000..97f4600
--- /dev/null
+++ b/lib/tf-idf-similarity/extras/collection.rb
@@ -0,0 +1,83 @@
+class TfIdfSimilarity::Collection
+  # @note SMART n, Salton x, Chisholm NONE
+  def no_collection_frequency(term)
+    1.0
+  end
+
+  # @note SMART t, Salton f, Chisholm IDFB
+  def plain_inverse_document_frequency(term)
+    count = document_counts(term).to_f
+    Math.log2 documents.size / count
+  end
+  alias_method :plain_idf, :plain_inverse_document_frequency
+
+  # @note SMART p, Salton p, Chisholm IDFP
+  def probabilistic_inverse_document_frequency(term)
+    count = document_counts(term).to_f
+    Math.log2 (documents.size - count) / count
+  end
+  alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
+
+  # @note Chisholm IGFF
+  def global_frequency_inverse_document_frequency(term)
+    term_counts[term] / document_counts(term).to_f
+  end
+  alias_method :gfidf, :global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFL
+  def log_global_frequency_inverse_document_frequency(term)
+    Math.log global_frequency_inverse_document_frequency(term) + 1
+  end
+  alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFI
+  def incremented_global_frequency_inverse_document_frequency(term)
+    global_frequency_inverse_document_frequency(term) + 1
+  end
+  alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFS
+  def square_root_global_frequency_inverse_document_frequency(term)
+    Math.sqrt global_frequency_inverse_document_frequency(term) - 0.9
+  end
+  alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
+
+  # @note Chisholm ENPY
+  def entropy(term)
+    denominator = term_counts[term].to_f
+    logN = Math.log2 documents.size
+    1 + documents.reduce(0) do |sum,document|
+      quotient = document.term_count(term) / denominator
+      sum += quotient * Math.log2(quotient) / logN
+    end
+  end
+
+
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] the same matrix
+  #
+  # @note SMART n, Salton x, Chisholm NONE
+  def no_normalization(matrix)
+    matrix
+  end
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] a matrix in which all document vectors are unit vectors
+  #
+  # @note SMART c, Salton c, Chisholm COSN
+  def cosine_normalization(matrix)
+    Matrix.columns(tfidf.column_vectors.map do |column|
+      column.normalize
+    end)
+  end
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] a matrix
+  #
+  # @note SMART u, Chisholm PUQN
+  def pivoted_unique_normalization(matrix)
+    # @todo
+    # http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
+  end
+end
diff --git a/lib/tf-idf-similarity/extras/document.rb b/lib/tf-idf-similarity/extras/document.rb
new file mode 100644
index 0000000..9904fe4
--- /dev/null
+++ b/lib/tf-idf-similarity/extras/document.rb
@@ -0,0 +1,110 @@
+class TfIdfSimilarity::Document
+  # Returns the term count.
+  #
+  # @note SMART n, Salton t, Chisholm FREQ
+  def plain_term_frequency(term)
+    term_count term
+  end
+  alias :plain_tf, :plain_term_frequency
+
+  # Returns 1 if the term is present, 0 otherwise.
+  #
+  # @note SMART b, Salton b, Chisholm BNRY
+  def binary_term_frequency(term)
+    count = term_count term
+    if count > 0
+      1
+    else
+      0
+    end
+  end
+  alias_method :binary_tf, :binary_term_frequency
+
+  # Normalizes the term count by the maximum term count.
+  #
+  # @see http://en.wikipedia.org/wiki/Tf*idf
+  def normalized_term_frequency(term)
+    term_count(term) / maximum_term_count
+  end
+  alias_method :normalized_tf, :normalized_term_frequency
+
+  # Further normalizes the normalized term frequency to lie between 0.5 and 1.
+  #
+  # @note SMART a, Salton n, Chisholm ATF1
+  def augmented_normalized_term_frequency(term)
+    0.5 + 0.5 * normalized_term_frequency(term)
+  end
+  alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
+
+  # @note Chisholm ATFA 
+  def augmented_average_term_frequency(term)
+    count = term_count term
+    if count > 0
+      0.9 + 0.1 * count / average_term_count
+    else
+      0
+    end
+  end
+  alias_method :augmented_average_tf, :augmented_average_term_frequency
+
+  # @note Chisholm ATFC
+  def changed_coefficient_augmented_normalized_term_frequency(term)
+    count = term_count term
+    if count > 0
+      0.2 + 0.8 * count / maximum_term_count
+    else
+      0
+    end
+  end
+  alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
+
+  # Dampen the term count using log.
+  #
+  # @note SMART l, Chisholm LOGA
+  def log_term_frequency(term)
+    count = term_count term
+    if count > 0
+      1 + Math.log2(count)
+    else
+      0
+    end
+  end
+  alias_method :log_tf, :log_term_frequency
+
+  # Dampen and normalize the term count by the average term count.
+  #
+  # @note SMART L, Chisholm LOGN
+  def normalized_log_term_frequency(term)
+    count = term_count term
+    if count > 0
+      (1 + Math.log2(count)) / (1 + Math.log2(average_term_count))
+    else
+      0
+    end
+  end
+  alias_method :normalized_log_tf, :normalized_log_term_frequency
+
+  # @note Chisholm LOGG
+  def augmented_log_term_frequency(term)
+    count = term_count term
+    if count > 0
+      0.2 + 0.8 * Math.log(count + 1)
+    else
+      0
+    end
+  end
+  alias_method :augmented_log_tf, :augmented_log_term_frequency
+
+  # Dampen the term count using square root.
+  #
+  # @note Chisholm SQRT
+  def square_root_term_frequency(term)
+    count = term_count term
+    if count > 0
+      Math.sqrt(count - 0.5) + 1
+    else
+      0
+    end
+  end
+  alias_method :square_root_tf, :square_root_term_frequency
+end
diff --git a/lib/tf-idf-similarity/token.rb b/lib/tf-idf-similarity/token.rb
new file mode 100644
index 0000000..6d0ed77
--- /dev/null
+++ b/lib/tf-idf-similarity/token.rb
@@ -0,0 +1,40 @@
+# @note We can add more filters from Solr and stem using Porter's Snowball.
+#
+# @see https://github.com/aurelian/ruby-stemmer
+# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
+# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
+# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
+class TfIdfSimilarity::Token < String
+  # Returns a falsy value if all its characters are numbers, punctuation,
+  # whitespace or control characters.
+  #
+  # @note Some implementations ignore one and two-letter words.
+  #
+  # @return [Boolean] whether the string is a token
+  def valid?
+    token[%r{
+      \A
+        (
+         \d           | # number
+         \p{Cntrl}    | # control character
+         \p{Punct}    | # punctuation
+         [[:space:]]    # whitespace
+        )+
+      \z
+    }x]
+  end
+
+  # @return [String] a lowercase string
+  #
+  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
+  def lowercase_filter
+    UnicodeUtils.downcase self, :fr
+  end
+
+  # @return [String] a string with no English possessive or periods in acronyms
+  #
+  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
+  def classic_filter
+    self.gsub('.', '').chomp "'s"
+  end
+end
diff --git a/td-idf-similarity.gemspec b/td-idf-similarity.gemspec
new file mode 100644
index 0000000..c8c3232
--- /dev/null
+++ b/td-idf-similarity.gemspec
@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "tf-idf-similarity/version"
+
+Gem::Specification.new do |s|
+  s.name        = "tf-idf-similarity"
+  s.version     = TfIdfSimilarity::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Open North"]
+  s.email       = ["info@opennorth.ca"]
+  s.homepage    = "http://github.com/opennorth/tf-idf-similarity"
+  s.summary     = %q{Implements a Vector Space Model (VSM) with tf*idf weights}
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.add_runtime_dependency('unicode_utils')
+  s.add_development_dependency('rspec', '~> 2.10')
+  s.add_development_dependency('rake')
+end