Merge pull request #97 from JuliaText/as/towards07
Prepare for 1.0
aviks committed Oct 1, 2018
2 parents 58057e9 + dbf5bed commit e835044
Showing 21 changed files with 143 additions and 136 deletions.
6 changes: 2 additions & 4 deletions .travis.yml
@@ -2,11 +2,9 @@ language: julia
 os:
   - linux
 julia:
-  - 0.6
+  - 0.7
+  - 1.0
 notifications:
   email: false
-script:
-  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
-  - julia -e 'Pkg.clone(pwd()); Pkg.build("TextAnalysis"); Pkg.test("TextAnalysis"; coverage=true)';
 after_success:
   - julia -e 'cd(Pkg.dir("TextAnalysis")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
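Note: with the script: block removed, these builds fall back to Travis's built-in default for language: julia. In Julia terms, that default amounts to roughly the following (a sketch of the assumed default behaviour, not part of this file):

    # Approximate effect of Travis's default build/test stage (assumption, for illustration):
    using Pkg
    Pkg.build("TextAnalysis")
    Pkg.test("TextAnalysis"; coverage=true)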
4 changes: 2 additions & 2 deletions REQUIRE
@@ -1,6 +1,6 @@
-julia 0.6
+julia 0.7
 BinaryProvider
-Languages 0.2.0
+Languages 0.4.0
 DataFrames
 WordTokenizers
 Flux
45 changes: 24 additions & 21 deletions appveyor.yml
@@ -1,9 +1,18 @@
 environment:
   matrix:
-  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
-  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
-# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
-# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
+  - julia_version: 0.7
+  - julia_version: 1
+  - julia_version: nightly
+
+platform:
+  - x86 # 32-bit
+  - x64 # 64-bit
+
+# # Uncomment the following lines to allow failures on nightly julia
+# # (tests will run but not make your overall status red)
+# matrix:
+allow_failures:
+- julia_version: nightly
+
 branches:
   only:
@@ -17,24 +26,18 @@ notifications:
   on_build_status_changed: false
 
 install:
-  - ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12"
-  # If there's a newer build queued for the same PR, cancel this one
-  - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod `
-        https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | `
-        Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { `
-        throw "There are newer queued builds for this pull request, failing early." }
-  # Download most recent Julia Windows binary
-  - ps: (new-object net.webclient).DownloadFile(
-        $env:JULIA_URL,
-        "C:\projects\julia-binary.exe")
-  # Run installer silently, output to C:\projects\julia
-  - C:\projects\julia-binary.exe /S /D=C:\projects\julia
+  - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1"))
 
 build_script:
-  # Need to convert from shallow to complete for Pkg.clone to work
-  - IF EXIST .git\shallow (git fetch --unshallow)
-  - C:\projects\julia\bin\julia -e "versioninfo();
-      Pkg.clone(pwd(), \"TextAnalysis\"); Pkg.build(\"TextAnalysis\")"
+  - echo "%JL_BUILD_SCRIPT%"
+  - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%"
 
 test_script:
-  - C:\projects\julia\bin\julia -e "Pkg.test(\"TextAnalysis\")"
+  - echo "%JL_TEST_SCRIPT%"
+  - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%"
 
+# # Uncomment to support code coverage upload. Should only be enabled for packages
+# # which would have coverage gaps without running on Windows
+# on_success:
+#   - echo "%JL_CODECOV_SCRIPT%"
+#   - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%"
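The new configuration delegates everything to the JuliaCI/Appveyor.jl template: install.ps1 installs the requested Julia version and injects the JL_*_SCRIPT environment variables. As an assumption for illustration (not quoting Appveyor.jl), the injected scripts boil down to the new Pkg API:

    # Assumed effect of the injected scripts (illustration only, not quoting Appveyor.jl):
    using Pkg
    Pkg.build("TextAnalysis")   # roughly %JL_BUILD_SCRIPT%
    Pkg.test("TextAnalysis")    # roughly %JL_TEST_SCRIPT%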
10 changes: 5 additions & 5 deletions docs/push-gh-pages.jl
@@ -4,9 +4,9 @@
 
 last_commit=readchomp(`git --no-pager log -1 --pretty=format:"%h:%s"`)
 
-ENV["GIT_DIR"]=abspath(chomp(readstring(`git rev-parse --git-dir`)))
+ENV["GIT_DIR"]=abspath(chomp(read(`git rev-parse --git-dir`, String)))
 
-old_sha = chomp(readstring(`git rev-parse refs/remotes/origin/gh-pages`))
+old_sha = chomp(read(`git rev-parse refs/remotes/origin/gh-pages`, String))
 
 #run(`julia make.jl`)
 
@@ -16,13 +16,13 @@ cd("build") do
     ENV["GIT_INDEX_FILE"]=gif
     ENV["GIT_WORK_TREE"]=pwd()
     run(`git add -A`)
-    tsha=chomp(readstring(`git write-tree`))
+    tsha=chomp(read(`git write-tree`, String))
     mesg="Deploy docs for master@$last_commit"
 
     if length(old_sha) == 40
-        csha = chomp(readstring(`git commit-tree $tsha -p $old_sha -m $(mesg)`))
+        csha = chomp(read(`git commit-tree $tsha -p $old_sha -m $(mesg)`, String))
     else
-        csha = chomp(readstring(`git commit-tree $tsha -m $(mesg)`))
+        csha = chomp(read(`git commit-tree $tsha -m $(mesg)`, String))
     end
 
     print("Created commit $csha")
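readstring was removed in Julia 0.7; read(x, String) is the drop-in replacement for both files and commands. A minimal sketch:

    # 0.6:  s = readstring(`git rev-parse HEAD`)
    # 0.7+: s = read(`git rev-parse HEAD`, String)
    sha = chomp(read(`git rev-parse HEAD`, String))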
4 changes: 4 additions & 0 deletions src/TextAnalysis.jl
@@ -1,6 +1,10 @@
 using DataFrames
 
 module TextAnalysis
+using SparseArrays
+using Printf
+using LinearAlgebra
+
 using Languages
 using DataFrames
 using WordTokenizers
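SparseArrays, Printf, and LinearAlgebra were moved out of Base into standard libraries in Julia 0.7, so any code touching sparse matrices, @printf, or linear algebra must load them explicitly. A minimal sketch of why each import is needed (illustrative usage, assumed):

    using SparseArrays, Printf, LinearAlgebra
    m = spzeros(Int, 3, 3)                  # SparseMatrixCSC now lives in SparseArrays
    @printf("%d stored entries\n", nnz(m))  # @printf now lives in Printf
    norm([1.0, 2.0])                        # norm now lives in LinearAlgebra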
19 changes: 10 additions & 9 deletions src/corpus.jl
@@ -45,7 +45,7 @@ function DirectoryCorpus(dirname::AbstractString)
 
     cd(dirname)
     for filename in readdir(".")
-        if isfile(filename) && !ismatch(r"^\.", filename)
+        if isfile(filename) && !occursin(r"^\.", filename)
            push!(docs, FileDocument(abspath(filename)))
         end
         if isdir(filename) && !islink(filename)
@@ -102,9 +102,10 @@ end
 #
 ##############################################################################
 
-Base.start(crps::Corpus) = 1
-Base.next(crps::Corpus, ind::Int) = (crps.documents[ind], ind + 1)
-Base.done(crps::Corpus, ind::Int) = ind > length(crps.documents)
+function Base.iterate(crps::Corpus, ind=1)
+    ind > length(crps.documents) && return nothing
+    crps.documents[ind], ind+1
+end
 
 ##############################################################################
 #
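Julia 0.7 replaced the three-method start/next/done iteration protocol with a single Base.iterate function, as above. With that one definition a Corpus works in for loops and with collect (sketch, assuming a populated crps):

    for doc in crps              # lowers to iterate(crps) / iterate(crps, state)
        println(length(text(doc)))
    end
    docs = collect(crps)         # also works via the same protocol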
@@ -115,8 +116,8 @@ Base.done(crps::Corpus, ind::Int) = ind > length(crps.documents)
 Base.push!(crps::Corpus, d::AbstractDocument) = push!(crps.documents, d)
 Base.pop!(crps::Corpus) = pop!(crps.documents)
 
-Base.unshift!(crps::Corpus, d::AbstractDocument) = unshift!(crps.documents, d)
-Base.shift!(crps::Corpus) = shift!(crps.documents)
+Base.pushfirst!(crps::Corpus, d::AbstractDocument) = pushfirst!(crps.documents, d)
+Base.popfirst!(crps::Corpus) = popfirst!(crps.documents)
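unshift! and shift! were renamed pushfirst! and popfirst! in Julia 0.7; the one-line delegations above keep Corpus aligned with the new Base names. For example (sketch):

    pushfirst!(crps, StringDocument("goes first"))  # prepend a document
    d = popfirst!(crps)                             # and take it back off the front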

 function Base.insert!(crps::Corpus, index::Int, d::AbstractDocument)
     insert!(crps.documents, index, d)
@@ -133,8 +134,8 @@ Base.delete!(crps::Corpus, index::Integer) = delete!(crps.documents, index)
 ##############################################################################
 
 Base.getindex(crps::Corpus, ind::Real) = crps.documents[ind]
-Base.getindex{T <: Real}(crps::Corpus, inds::Vector{T}) = crps.documents[inds]
-Base.getindex(crps::Corpus, r::Range) = crps.documents[r]
+Base.getindex(crps::Corpus, inds::Vector{T}) where {T <: Real} = crps.documents[inds]
+Base.getindex(crps::Corpus, r::AbstractRange) = crps.documents[r]
 Base.getindex(crps::Corpus, term::AbstractString) = get(crps.inverse_index, term, Int[])
 
 ##############################################################################
@@ -226,7 +227,7 @@ hash_function!(crps::Corpus, f::TextHashFunction) = (crps.h = f; nothing)
 #
 ##############################################################################
 
-function standardize!{T <: AbstractDocument}(crps::Corpus, ::Type{T})
+function standardize!(crps::Corpus, ::Type{T}) where T <: AbstractDocument
     for i in 1:length(crps)
         crps.documents[i] = convert(T, crps.documents[i])
     end
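standardize! converts every document in a corpus to a single concrete type, using the new trailing where syntax. A usage sketch (assuming the usual Corpus constructor):

    crps = Corpus([StringDocument("hello world"), TokenDocument("hello world")])
    standardize!(crps, NGramDocument)   # every element is now an NGramDocument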
8 changes: 4 additions & 4 deletions src/deprecations.jl
@@ -1,22 +1,22 @@
 
 ## Deprecations for Languages
 
-function tokenize{S <: Language, T <: AbstractString}(::Type{S}, s::T)
+function tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString}
     depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S))
     tokenize(S(), s)
 end
 
-function ngramize{S <: Language, T <: AbstractString}(::Type{S}, words::Vector{T}, n::Int)
+function ngramize(::Type{S}, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
     depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S))
     ngramize(S(), words, n)
 end
 
-function onegramize{S <: Language, T <: AbstractString}(::Type{S}, words::Vector{T})
+function onegramize(::Type{S}, words::Vector{T}) where {S <: Language, T <: AbstractString}
     depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S))
     onegramize(S(), words)
 end
 
-function stem_all{S <: Language}(stemmer::Stemmer, lang::Type{S}, sentence::AbstractString)
+function stem_all(stemmer::Stemmer, lang::Type{S}, sentence::AbstractString) where S <: Language
     depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S))
     stem_all(stemmer, S(), sentence)
 end
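The f{T}(...) parametric-method syntax was deprecated in Julia 0.7 in favor of a trailing where clause, which is the mechanical rewrite applied throughout this commit. Schematically:

    # 0.6:   tokenize{S <: Language}(::Type{S}, s) = ...
    # 0.7+:  tokenize(::Type{S}, s) where {S <: Language} = ...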
24 changes: 12 additions & 12 deletions src/document.jl
@@ -4,7 +4,7 @@
 #
 ##############################################################################
 
-type DocumentMetadata
+mutable struct DocumentMetadata
     language
     name::String
     author::String
@@ -31,7 +31,7 @@ abstract type AbstractDocument; end
 #
 ##############################################################################
 
-type FileDocument <: AbstractDocument
+mutable struct FileDocument <: AbstractDocument
     filename::String
     metadata::DocumentMetadata
 end
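Julia 0.7 replaced the type keyword with mutable struct (and immutable with struct), so these declarations are renamed without changing behavior: fields stay assignable. A minimal sketch:

    mutable struct Point   # was `type Point` on 0.6
        x::Int
    end
    p = Point(1)
    p.x = 2                # still allowed: the struct is mutable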
@@ -48,7 +48,7 @@ end
 #
 ##############################################################################
 
-type StringDocument{T<:AbstractString} <: AbstractDocument
+mutable struct StringDocument{T<:AbstractString} <: AbstractDocument
     text::T
     metadata::DocumentMetadata
 end
@@ -61,14 +61,14 @@ StringDocument(txt::AbstractString) = StringDocument(txt, DocumentMetadata())
 #
 ##############################################################################
 
-type TokenDocument{T<:AbstractString} <: AbstractDocument
+mutable struct TokenDocument{T<:AbstractString} <: AbstractDocument
     tokens::Vector{T}
     metadata::DocumentMetadata
 end
 function TokenDocument(txt::AbstractString, dm::DocumentMetadata)
     TokenDocument(tokenize(dm.language, String(txt)), dm)
 end
-function TokenDocument{T <: AbstractString}(tkns::Vector{T})
+function TokenDocument(tkns::Vector{T}) where T <: AbstractString
     TokenDocument(tkns, DocumentMetadata())
 end
 TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata())
@@ -79,7 +79,7 @@ TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata())
 #
 ##############################################################################
 
-type NGramDocument{T<:AbstractString} <: AbstractDocument
+mutable struct NGramDocument{T<:AbstractString} <: AbstractDocument
     ngrams::Dict{T,Int}
     n::Int
     metadata::DocumentMetadata
@@ -91,7 +91,7 @@ end
 function NGramDocument(txt::AbstractString, n::Integer=1)
     NGramDocument(txt, DocumentMetadata(), n)
 end
-function NGramDocument{T <: AbstractString}(ng::Dict{T, Int}, n::Integer=1)
+function NGramDocument(ng::Dict{T, Int}, n::Integer=1) where T <: AbstractString
     NGramDocument(merge(Dict{AbstractString,Int}(), ng), n, DocumentMetadata())
 end
 
@@ -103,12 +103,12 @@ end
 
 function text(fd::FileDocument)
     !isfile(fd.filename) && error("Can't find file: $(fd.filename)")
-    readstring(fd.filename)
+    read(fd.filename, String)
 end
 
 text(sd::StringDocument) = sd.text
 function text(td::TokenDocument)
-    warn("TokenDocument's can only approximate the original text")
+    @warn("TokenDocument's can only approximate the original text")
     join(td.tokens, " ")
 end
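The plain warn function was replaced by the @warn logging macro in Julia 0.7; warnings now go through the logging system. A sketch:

    @warn "TokenDocuments can only approximate the original text"   # was: warn("...")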
 function text(ngd::NGramDocument)
@@ -132,8 +132,8 @@ function tokens(d::NGramDocument)
     error("The tokens of an NGramDocument cannot be reconstructed")
 end
 
-tokens!{T <: AbstractString}(d::TokenDocument, new_tokens::Vector{T}) = (d.tokens = new_tokens)
-function tokens!{T <: AbstractString}(d::AbstractDocument, new_tokens::Vector{T})
+tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString} = (d.tokens = new_tokens)
+function tokens!(d::AbstractDocument, new_tokens::Vector{T}) where T <: AbstractString
     error("The tokens of a $(typeof(d)) cannot be directly edited")
 end
@@ -199,7 +199,7 @@ const GenericDocument = Union{
 ##############################################################################
 
 Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str)
-Document{T <: AbstractString}(tkns::Vector{T}) = TokenDocument(tkns)
+Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns)
 Document(ng::Dict{String, Int}) = NGramDocument(ng)
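For reference, the Document convenience constructor dispatches on its argument type, so (sketch):

    Document("some text")                     # StringDocument, or FileDocument if the path exists
    Document(["some", "text"])                # TokenDocument
    Document(Dict("some" => 1, "text" => 1))  # NGramDocument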

##############################################################################
20 changes: 10 additions & 10 deletions src/dtm.jl
@@ -4,7 +4,7 @@
 #
 ##############################################################################
 
-type DocumentTermMatrix
+mutable struct DocumentTermMatrix
     dtm::SparseMatrixCSC{Int, Int}
     terms::Vector{String}
     column_indices::Dict{String, Int}
@@ -32,9 +32,9 @@ function DocumentTermMatrix(crps::Corpus, terms::Vector{String})
     m = length(crps)
     n = length(terms)
 
-    rows = Array{Int}(0)
-    columns = Array{Int}(0)
-    values = Array{Int}(0)
+    rows = Array{Int}(undef, 0)
+    columns = Array{Int}(undef, 0)
+    values = Array{Int}(undef, 0)
     for i in 1:m
         doc = crps.documents[i]
         ngs = ngrams(doc)
@@ -57,7 +57,7 @@ function DocumentTermMatrix(crps::Corpus, terms::Vector{String})
 end
 DocumentTermMatrix(crps::Corpus) = DocumentTermMatrix(crps, lexicon(crps))
 
-DocumentTermMatrix(crps::Corpus, lex::Associative) = DocumentTermMatrix(crps, sort(collect(keys(lex))))
+DocumentTermMatrix(crps::Corpus, lex::AbstractDict) = DocumentTermMatrix(crps, sort(collect(keys(lex))))
 
 DocumentTermMatrix(dtm::SparseMatrixCSC{Int, Int},terms::Vector{String}) = DocumentTermMatrix(dtm, terms, columnindices(terms))
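Two 0.7 renames appear in these hunks: uninitialized arrays now require an explicit undef, and the Associative abstract type became AbstractDict. A sketch:

    rows = Array{Int}(undef, 0)          # was Array{Int}(0); Int[] is equivalent here
    Dict{String,Int}() isa AbstractDict  # true; `Associative` no longer exists on 1.0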

Expand All @@ -71,7 +71,7 @@ function dtm(d::DocumentTermMatrix, density::Symbol)
if density == :sparse
return d.dtm
else
return full(d.dtm)
return Matrix(d.dtm)
end
end
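full(A) was likewise replaced by the Matrix constructor for densifying sparse matrices. With the accessor above (sketch, assuming a DocumentTermMatrix m):

    dtm(m, :sparse)   # the underlying SparseMatrixCSC, no copy
    dtm(m, :dense)    # any other symbol densifies via Matrix(m.dtm)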

@@ -99,8 +99,8 @@ tdm(crps::Corpus) = dtm(crps)' #'
 
 function dtm_entries(d::AbstractDocument, lex::Dict{String, Int})
     ngs = ngrams(d)
-    indices = Array{Int}(0)
-    values = Array{Int}(0)
+    indices = Array{Int}(undef, 0)
+    values = Array{Int}(undef, 0)
     terms = sort(collect(keys(lex)))
     column_indices = columnindices(terms)

@@ -166,7 +166,7 @@ hash_tdm(crps::Corpus) = hash_dtm(crps)' #'
 #
 ##############################################################################
 
-type EachDTV
+mutable struct EachDTV
     crps::Corpus
 end

@@ -178,7 +178,7 @@
 
 done(edt::EachDTV, state::Int) = state > length(edt.crps.documents)
 
-type EachHashDTV
+mutable struct EachHashDTV
     crps::Corpus
 end

2 changes: 1 addition & 1 deletion src/hash.jl
@@ -18,7 +18,7 @@
 #
 ##############################################################################
 
-type TextHashFunction
+mutable struct TextHashFunction
     hash_function::Function
     cardinality::Int
 end
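TextHashFunction pairs a hashing function with a fixed cardinality so that arbitrary terms can be mapped into a bounded index range (the hashing trick). A sketch of the idea, with a hypothetical helper for illustration (not quoting this file):

    h = TextHashFunction(hash, 100)   # Base.hash into 100 buckets
    bucket(s, h) = mod(h.hash_function(s), h.cardinality) + 1   # hypothetical helper
    bucket("term", h)                 # a stable column index in 1:100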
[Diff truncated: the remaining 11 changed files are not shown in this view.]