From dc6dbed1e74ddf49d65be886c13e113b0292d1a1 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Mon, 9 Nov 2020 14:21:33 -0500 Subject: [PATCH 1/2] Add .drone.yml for ARM testing --- .drone.yml | 41 +++++++++++++++++++++++++++++++++++++++++ test/bench.jl | 8 ++++---- 2 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 .drone.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 0000000..b2771d5 --- /dev/null +++ b/.drone.yml @@ -0,0 +1,41 @@ +--- +kind: pipeline +name: linux - arm - Julia 1.0 + +platform: + os: linux + arch: arm + +steps: +- name: build + image: julia:1.0 + commands: + - "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'" + +--- +kind: pipeline +name: linux - arm64 - Julia 1.0 + +platform: + os: linux + arch: arm64 + +steps: +- name: build + image: julia:1.0 + commands: + - "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'" + +--- +kind: pipeline +name: linux - arm64 - Julia 1.5 + +platform: + os: linux + arch: arm64 + +steps: +- name: build + image: julia:1.5 + commands: + - "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'" diff --git a/test/bench.jl b/test/bench.jl index e5240f9..b903355 100644 --- a/test/bench.jl +++ b/test/bench.jl @@ -194,7 +194,7 @@ function display_results(io, xres) # pwc(:yellow, io, f"\%10.3f(rn[3]/numchars)") end -function dispres(io, xres) +function dispres(io, xres, skip_unistr=true) # (fname, stats, sizes, res) (fname, stats, sizes, selstat, selsiz, res) = xres show(io, (fname, stats)) @@ -230,7 +230,7 @@ function dispres(io, xres) end for i = 2:length(res) rn = res[i] - #rn[1] == "UniStr" && continue + skip_unistr && rn[1] == "UniStr" && continue pr"\(io)\n\%-12.12s(rn[1])\%6.3f(sizes[i]/stats.len)" tn = rn[3] minres = min(length(t1), length(tn)) @@ -246,14 +246,14 @@ end const divline = string(repeat('#', 100),'\n','\f') -function dispbench(io, totres) +function dispbench(io, totres; kwargs...) for res in totres[1] dispres(io, res) print(io, divline) end end -dispbench(totres) = dispbench(_stdout(), totres) +dispbench(totres; kwargs...) = dispbench(_stdout(), totres; kwargs...) function display_benchmark(io, totres) for res in totres[1] From b46a6184f0ad5ab4f751584e0e48ecfcb4de24e2 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Mon, 17 May 2021 10:32:27 -0400 Subject: [PATCH 2/2] Improve handling of Project Gutenberg books --- .drone.yml | 16 ++-- .github/workflows/ci.yml | 43 +++++++++++ .travis.yml | 35 --------- Project.toml | 34 ++++---- README.md | 41 +--------- test/bench.jl | 134 ++++---------------------------- test/books.jl | 162 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 244 insertions(+), 221 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .travis.yml create mode 100644 test/books.jl diff --git a/.drone.yml b/.drone.yml index b2771d5..7fc521a 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,34 +1,34 @@ --- kind: pipeline -name: linux - arm - Julia 1.0 +name: linux - arm64 - Julia 1.5 platform: os: linux - arch: arm + arch: arm64 steps: - name: build - image: julia:1.0 + image: julia:1.5 commands: - "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'" --- kind: pipeline -name: linux - arm64 - Julia 1.0 +name: linux - arm - Julia 1.6 platform: os: linux - arch: arm64 + arch: arm steps: - name: build - image: julia:1.0 + image: julia:1.6 commands: - "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'" --- kind: pipeline -name: linux - arm64 - Julia 1.5 +name: linux - arm64 - Julia 1.6 platform: os: linux @@ -36,6 +36,6 @@ platform: steps: - name: build - image: julia:1.5 + image: julia:1.6 commands: - "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..36c5c31 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,43 @@ +name: CI +on: + - push + - pull_request +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.5' + - '1.6' + - 'nightly' + os: + - ubuntu-latest + - macOS-latest + - windows-latest + arch: + - x64 + - x86 + exclude: + - os: macOS-latest + arch: x86 + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: actions/cache@v1 + env: + cache-name: cache-artifacts + with: + path: ~/.julia/artifacts + key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} + restore-keys: | + ${{ runner.os }}-test-${{ env.cache-name }}- + ${{ runner.os }}-test- + ${{ runner.os }}- + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1a72166..0000000 --- a/.travis.yml +++ /dev/null @@ -1,35 +0,0 @@ -## Documentation: http://docs.travis-ci.com/user/languages/julia/ -language: julia -os: - - linux - - osx - - windows -julia: - - 1.0 - - 1 - - nightly -notifications: - email: false -git: - depth: 99999999 - -## uncomment the following lines to allow failures on nightly julia -## (tests will run but not make your overall status red) -matrix: - allow_failures: - - julia: nightly - -## uncomment and modify the following lines to manually install system packages -#addons: -# apt: # apt-get for linux -# packages: -# - gfortran -#before_script: # homebrew for mac -# - if [ $TRAVIS_OS_NAME = osx ]; then brew install gcc; fi - -## uncomment the following lines to override the default test script -#script: - -after_success: - # push coverage results to Codecov - - julia -e 'using Pkg; cd(Pkg.dir("Strs")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())' diff --git a/Project.toml b/Project.toml index c454753..47a0025 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,7 @@ authors = ["ScottPJones "] keywords = ["Strings", "Characters", "Formatting", "Intern", "Unicode", "Regex"] license = "MIT" uuid = "7bddbee9-b4ee-5d4f-bf0b-c84b4398bbf6" -version = "1.0.3" +version = "1.1.0" [deps] ModuleInterfaceTools = "5cb8414e-7aab-5a03-a681-351269c074bf" @@ -41,21 +41,21 @@ Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" test = ["Test", "REPL", "BenchmarkTools", "Serialization"] [compat] -julia = "^1.0.0" -PCRE2 = "1.0.2" -Format = "1.1.0" -ModuleInterfaceTools = "1.0.1" +julia = "^1.5" +PCRE2 = "^1.0.2" +Format = "^1.3.1" +ModuleInterfaceTools = "^1.0.1" InternedStrings = "0.7.0" -StrAPI = "1" -StrBase = "1.0.4" +StrAPI = "^1.1" +StrBase = "^1.0.5" StrTables = "1" -StrEntities = "1" -StrFormat = "1" -StrRegex = "1.0.1" -StrLiterals = "1" -ChrBase = "1.0.1" -Emoji_Entities = "1" -HTML_Entities = "1" -LaTeX_Entities = "1" -Unicode_Entities = "1" -MurmurHash3 = "^1.0.3" +StrEntities = "^1.0.1" +StrFormat = "^1.0.1" +StrRegex = "^1.1.1" +StrLiterals = "^1.1" +ChrBase = "^1.0.3" +MurmurHash3 = "^1.2" +Emoji_Entities = "^1.0.3" +HTML_Entities = "^1.0.1" +LaTeX_Entities = "^1.0.2" +Unicode_Entities = "^1.1.1" diff --git a/README.md b/README.md index ad28111..4b50b9b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ [pkg-url]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/report.html [strs-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/Strs.svg [contrib]: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat -[travis-url]: https://travis-ci.org/JuliaString/Strs.jl -[travis-img]: https://travis-ci.org/JuliaString/Strs.jl.svg [codecov-url]: https://codecov.io/gh/JuliaString/Strs.jl [codecov-img]: https://codecov.io/gh/JuliaString/Strs.jl/branch/master/graph/badge.svg @@ -12,7 +10,6 @@ [![contributions welcome][contrib]](https://github.com/JuliaString/Strs.jl/issues) [![][strs-pkg]][pkg-url] -[![][travis-img]][travis-url] [![][codecov-img]][codecov-url] Strs.jl is a container for a number of different packages from [JuliaString.org](https://juliastring.org) @@ -96,132 +93,96 @@ or pointers to such (such as a way to get lots of tweets, to test mixed text and [mit-loc]: https://github.com/JuliaString/ModuleInterfaceTools.jl [mit-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/M/ModuleInterfaceTools.svg -[mit-tvs-img]: https://travis-ci.org/JuliaString/ModuleInterfaceTools.jl.svg?branch=master -[mit-tvs-url]: https://travis-ci.org/JuliaString/ModuleInterfaceTools.jl [mit-rel]: https://img.shields.io/github/release/JuliaString/ModuleInterfaceTools.jl.svg?label="." [mit-dat]: https://img.shields.io/github/release-date/JuliaString/ModuleInterfaceTools.jl.svg?label="." [mh3-loc]: https://github.com/JuliaString/MurmurHash3.jl [mh3-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/M/MurmurHash3.svg -[mh3-tvs-img]: https://travis-ci.org/JuliaString/MurmurHash3.jl.svg?branch=master -[mh3-tvs-url]: https://travis-ci.org/JuliaString/MurmurHash3.jl [mh3-rel]: https://img.shields.io/github/release/JuliaString/MurmurHash3.jl.svg?label="." [mh3-dat]: https://img.shields.io/github/release-date/JuliaString/MurmurHash3.jl.svg?label="." [pcre2-loc]: https://github.com/JuliaString/PCRE2.jl [pcre2-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/P/PCRE2.svg -[pcre2-tvs-img]: https://travis-ci.org/JuliaString/PCRE2.jl.svg?branch=master -[pcre2-tvs-url]: https://travis-ci.org/JuliaString/PCRE2.jl [pcre2-rel]: https://img.shields.io/github/release/JuliaString/PCRE2.jl.svg?label="." [pcre2-dat]: https://img.shields.io/github/release-date/JuliaString/PCRE2.jl.svg?label="." [format-loc]: https://github.com/JuliaString/Format.jl [format-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/F/Format.svg -[format-tvs-img]: https://travis-ci.org/JuliaString/Format.jl.svg?branch=master -[format-tvs-url]: https://travis-ci.org/JuliaString/Format.jl [format-rel]: https://img.shields.io/github/release/JuliaString/Format.jl.svg?label="." [format-dat]: https://img.shields.io/github/release-date/JuliaString/Format.jl.svg?label="." [strapi-loc]: https://github.com/JuliaString/StrAPI.jl [strapi-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/StrAPI.svg -[strapi-tvs-img]: https://travis-ci.org/JuliaString/StrAPI.jl.svg?branch=master -[strapi-tvs-url]: https://travis-ci.org/JuliaString/StrAPI.jl [strapi-rel]: https://img.shields.io/github/release/JuliaString/StrAPI.jl.svg?label="." [strapi-dat]: https://img.shields.io/github/release-date/JuliaString/StrAPI.jl.svg?label="." [cse-loc]: https://github.com/JuliaString/CharSetEncodings.jl [cse-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/C/CharSetEncodings.svg -[cse-tvs-img]: https://travis-ci.org/JuliaString/CharSetEncodings.jl.svg?branch=master -[cse-tvs-url]: https://travis-ci.org/JuliaString/CharSetEncodings.jl [cse-rel]: https://img.shields.io/github/release/JuliaString/CharSetEncodings.jl.svg?label="." [cse-dat]: https://img.shields.io/github/release-date/JuliaString/CharSetEncodings.jl.svg?label="." [chrbase-loc]: https://github.com/JuliaString/ChrBase.jl [chrbase-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/C/ChrBase.svg -[chrbase-tvs-img]: https://travis-ci.org/JuliaString/ChrBase.jl.svg?branch=master -[chrbase-tvs-url]: https://travis-ci.org/JuliaString/ChrBase.jl [chrbase-rel]: https://img.shields.io/github/release/JuliaString/ChrBase.jl.svg?label="." [chrbase-dat]: https://img.shields.io/github/release-date/JuliaString/ChrBase.jl.svg?label="." [strbase-loc]: https://github.com/JuliaString/StrBase.jl [strbase-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/StrBase.svg -[strbase-tvs-img]: https://travis-ci.org/JuliaString/StrBase.jl.svg?branch=master -[strbase-tvs-url]: https://travis-ci.org/JuliaString/StrBase.jl [strbase-rel]: https://img.shields.io/github/release/JuliaString/StrBase.jl.svg?label="." [strbase-dat]: https://img.shields.io/github/release-date/JuliaString/StrBase.jl.svg?label="." [strregex-loc]: https://github.com/JuliaString/StrRegex.jl [strregex-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/StrRegex.svg -[strregex-tvs-img]: https://travis-ci.org/JuliaString/StrRegex.jl.svg?branch=master -[strregex-tvs-url]: https://travis-ci.org/JuliaString/StrRegex.jl [strregex-rel]: https://img.shields.io/github/release/JuliaString/StrRegex.jl.svg?label="." [strregex-dat]: https://img.shields.io/github/release-date/JuliaString/StrRegex.jl.svg?label="." [strliterals-loc]: https://github.com/JuliaString/StrLiterals.jl [strliterals-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/StrLiterals.svg -[strliterals-tvs-img]: https://travis-ci.org/JuliaString/StrLiterals.jl.svg?branch=master -[strliterals-tvs-url]: https://travis-ci.org/JuliaString/StrLiterals.jl [strliterals-rel]: https://img.shields.io/github/release/JuliaString/StrLiterals.jl.svg?label="." [strliterals-dat]: https://img.shields.io/github/release-date/JuliaString/StrLiterals.jl.svg?label="." [strformat-loc]: https://github.com/JuliaString/StrFormat.jl [strformat-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/StrFormat.svg -[strformat-tvs-img]: https://travis-ci.org/JuliaString/StrFormat.jl.svg?branch=master -[strformat-tvs-url]: https://travis-ci.org/JuliaString/StrFormat.jl [strformat-rel]: https://img.shields.io/github/release/JuliaString/StrFormat.jl.svg?label="." [strformat-dat]: https://img.shields.io/github/release-date/JuliaString/StrFormat.jl.svg?label="." [strtables-loc]: https://github.com/JuliaString/StrTables.jl [strtables-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/StrTables.svg -[strtables-tvs-img]: https://travis-ci.org/JuliaString/StrTables.jl.svg?branch=master -[strtables-tvs-url]: https://travis-ci.org/JuliaString/StrTables.jl [strtables-rel]: https://img.shields.io/github/release/JuliaString/StrTables.jl.svg?label="." [strtables-dat]: https://img.shields.io/github/release-date/JuliaString/StrTables.jl.svg?label="." [html-loc]: https://github.com/JuliaString/HTML_Entities.jl [html-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/H/HTML_Entities.svg -[html-tvs-img]: https://travis-ci.org/JuliaString/HTML_Entities.jl.svg?branch=master -[html-tvs-url]: https://travis-ci.org/JuliaString/HTML_Entities.jl [html-rel]: https://img.shields.io/github/release/JuliaString/HTML_Entities.jl.svg?label="." [html-dat]: https://img.shields.io/github/release-date/JuliaString/HTML_Entities.jl.svg?label="." [emoji-loc]: https://github.com/JuliaString/Emoji_Entities.jl [emoji-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/E/Emoji_Entities.svg -[emoji-tvs-img]: https://travis-ci.org/JuliaString/Emoji_Entities.jl.svg?branch=master -[emoji-tvs-url]: https://travis-ci.org/JuliaString/Emoji_Entities.jl [emoji-rel]: https://img.shields.io/github/release/JuliaString/Emoji_Entities.jl.svg?label="." [emoji-dat]: https://img.shields.io/github/release-date/JuliaString/Emoji_Entities.jl.svg?label="." [latex-loc]: https://github.com/JuliaString/LaTeX_Entities.jl [latex-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/L/LaTeX_Entities.svg -[latex-tvs-img]: https://travis-ci.org/JuliaString/LaTeX_Entities.jl.svg?branch=master -[latex-tvs-url]: https://travis-ci.org/JuliaString/LaTeX_Entities.jl [latex-rel]: https://img.shields.io/github/release/JuliaString/LaTeX_Entities.jl.svg?label="." [latex-dat]: https://img.shields.io/github/release-date/JuliaString/LaTeX_Entities.jl.svg?label="." [unicode-loc]: https://github.com/JuliaString/Unicode_Entities.jl [unicode-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/U/Unicode_Entitites.svg -[unicode-tvs-img]: https://travis-ci.org/JuliaString/Unicode_Entities.jl.svg?branch=master -[unicode-tvs-url]: https://travis-ci.org/JuliaString/Unicode_Entities.jl [unicode-rel]: https://img.shields.io/github/release/JuliaString/Unicode_Entities.jl.svg?label="." [unicode-dat]: https://img.shields.io/github/release-date/JuliaString/Unicode_Entities.jl.svg?label="." [strentities-loc]: https://github.com/JuliaString/StrEntities.jl [strentities-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/S/StrEntities.svg -[strentities-tvs-img]: https://travis-ci.org/JuliaString/StrEntities.jl.svg?branch=master -[strentities-tvs-url]: https://travis-ci.org/JuliaString/StrEntities.jl [strentities-rel]: https://img.shields.io/github/release/JuliaString/StrEntities.jl.svg?label="." [strentities-dat]: https://img.shields.io/github/release-date/JuliaString/StrEntities.jl.svg?label="." [int-loc]: https://github.com/JuliaString/InternedStrings.jl [int-pkg]: https://juliaci.github.io/NanosoldierReports/pkgeval_badges/I/InternedStrings.svg -[int-tvs-img]: https://travis-ci.org/JuliaString/InternedStrings.jl.svg?branch=master -[int-tvs-url]: https://travis-ci.org/JuliaString/InternedStrings.jl [int-rel]: https://img.shields.io/github/release/JuliaString/InternedStrings.jl.svg?label="." [int-dat]: https://img.shields.io/github/release-date/JuliaString/InternedStrings.jl.svg?label="." -The new package [ModuleInterfaceTools](https://github.com/JuliaString/ModuleInterfaceTools.jl) is used to set up a consistent and easy to use API for most of the cooperating packages, without having to worry too much about imports, exports, using, and what functions are part of a public API, and which ones are part of the internal development API for other packages to extend. +The package [ModuleInterfaceTools](https://github.com/JuliaString/ModuleInterfaceTools.jl) is used to set up a consistent and easy to use API for most of the cooperating packages, without having to worry too much about imports, exports, using, and what functions are part of a public API, and which ones are part of the internal development API for other packages to extend. ## Architecture and Operations diff --git a/test/bench.jl b/test/bench.jl index b903355..1538743 100644 --- a/test/bench.jl +++ b/test/bench.jl @@ -1,7 +1,7 @@ #= Benchmarking routines for characters and strings -Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones +Copyright 2017-2021 Gandalf Software, Inc., Scott P. Jones Licensed under MIT License, see LICENSE.md THIS IS STILL VERY WIP AND HARDCODED! @@ -23,123 +23,10 @@ dispbench(res) # Displays the results in a pretty format isdefined(Main, :STRS_SETUP) || include("setup.jl") -@static V6_COMPAT || (using Serialization) -@static V6_COMPAT || (Base.iterate(it::Union{CodePoints,CodeUnits}) = iterate(it, 1)) - - -const inppath = "textsamples" -const gutpath = "gutenberg" -const smppath = "samples" - -const gutenbergbooks = - (("files/2600/2600-0", "English"), # War & Peace, some other languages in quotes - ("files/1400/1400-0", "English"), # Great Expectations, uses Unicode quotes - ("files/42286/42286-0", "Hungarian"), - #("files/8119/8119-0", "Polish"), # couldn't get this to load correctly - ("files/31536/31536-0", "Polish"), - ("files/32941/32941-0", "Japanese"), - ("files/24264/24264-0", "Chinese"), - ("files/40687/40687-0", "Telugu"), # Third most spoken in India, official - ("files/50513/50513-0", "French"), -# ("files/43007/43007-0", "Arabic"), - ("cache/epub/38496/pg38496", "Portuguese"), - ("cache/epub/2000/pg2000", "Spanish"), # Don Quijote - ("cache/epub/48750/pg48750", "Swedish"), - ("cache/epub/48322/pg48322", "German"), - ) - -const downloadedbooks = ( -# ("LYSAIa GORA DIeVICh'Ia - SIeRGIeI GOLOVAChIoV.txt", "Russian"), - ) - -getdefdir(dir)::String = dir === nothing ? homedir() : dir - -function filter_lines(lines) - out = Vector{String}() - sizehint!(out, length(lines)) - # Eliminate initial lines, empty lines, trailing lines - checkbeg = true - for l in lines - if sizeof(l) > 41 && starts_with(l, "***") && ends_with(l, "***") && - occurs_in(" PROJECT GUTENBERG EBOOK", l) && - occurs_in(checkbeg ? "START OF TH" : "END OF TH", l) - checkbeg || break # Found "end of" - checkbeg = false - else - push!(out, l) - end - end - out -end - -""" -Load books from Project Gutenberg site, removing lines added at beginning and end that -are not part of the book, as much as possible -""" -function load_gutenberg!(books, list, dict, gutenbergdir) - mkpath(gutenbergdir) - for (nam, lang) in list - cnt = get(dict, lang, 0) - dict[lang] = cnt + 1 - outnam = cnt == 0 ? "$lang.txt" : "$lang-$cnt.txt" - lname = joinpath(gutenbergdir, outnam) - download(joinpath("http://www.gutenberg.org/", nam * ".txt"), lname) - println("Saved to: ", lname) - push!(books, (outnam, filter_lines(readlines(lname)))) - end - books -end - -function remove_empty(lines) - # Eliminate empty lines - len = length(lines) - out = Vector{String}() - sizehint!(out, len) - for l in lines - is_empty(l) || push!(out, l) - end - out -end - -""" -load_books(; dir=nothing) - -Loads a set of books from a local directory, and downloads a set of books from Project Gutenberg -Returns them as a dictionary with names -> vectors of strings -""" -function load_books(; dir::Any=nothing) - defdir = getdefdir(dir) - inputdir = joinpath(defdir, inppath) - dict = Dict{String,Int}() - books = Vector{Tuple{String, Vector{String}}}() - for (nam, lang) in downloadedbooks - cnt = get(dict, lang, 0) - dict[lang] = cnt + 1 - outnam = cnt == 0 ? "$lang.txt" : "$lang-$cnt.txt" - push!(books, (outnam, readlines(joinpath(inputdir, nam)))) - end - load_gutenberg!(books, gutenbergbooks, dict, joinpath(defdir, gutpath)) -end +using Serialization +Base.iterate(it::Union{CodePoints,CodeUnits}) = iterate(it, 1) -""" -save_books(books; dir=nothing) - -Saves the collection of downloaded books into the given directory, in a "samples" subdirectory. -If the directory is not set, it will default to the user's home directory -""" -function save_books(books; dir::Any=nothing) - sampledir = joinpath(getdefdir(dir), smppath) - mkpath(sampledir) - for (nam, book) in books - outnam = joinpath(sampledir, nam) - open(outnam, "w") do io - for lin in book - println(io, lin) - end - println("Saved $nam") - end - end -end +include("books.jl") Base.show(io::IO, cnt::LineCounts) = pr"\(io)\%10d(cnt.bytes)\%12.3f(cnt.bytes/cnt.chars)" @@ -1009,10 +896,16 @@ function checktests(io = _stdout(); dir::Any=nothing, test::Bool=false) sampledir = joinpath(getdefdir(dir), smppath) for fname in readdir(sampledir) lines = readlines(joinpath(sampledir, fname)) - stats = calcstats(lines) list = [String, UTF8Str, UTF16Str, UTF32Str, UniStr] - MT = enctyp(stats.maxtyp) - MT != UTF32Str && push!(list, MT) + try + stats = calcstats(lines) + MT = enctyp(stats.maxtyp) + MT != UTF32Str && push!(list, MT) + catch ex + typeof(ex) == InterruptException || + pr"calcstats failed on \(fname): \(sprint(showerror, ex, catch_backtrace()))" + rethrow() + end isdefined(Main, :UTF8String) && push!(list, UTF8String, UTF16String, UTF32String) enc = encode_lines(list, lines) res = (runcheckline(Integer, lines, testlist[1][1]), @@ -1123,4 +1016,3 @@ function load_results(fname) deserialize(io) end end - diff --git a/test/books.jl b/test/books.jl new file mode 100644 index 0000000..6a92b94 --- /dev/null +++ b/test/books.jl @@ -0,0 +1,162 @@ +#= +Benchmarking routines for characters and strings + +Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones +Licensed under MIT License, see LICENSE.md + +Functions to load up sample books +=# + +const inppath = "textsamples" +const gutpath = "gutenberg" +const smppath = "samples" + +const gutenbergbooks = + (("2600/2600-0", "English"), # War & Peace, some other languages in quotes + ("1400/1400-0", "English"), # Great Expectations, uses Unicode quotes + ("42286/42286-0", "Hungarian"), + ("31536/31536-0", "Polish"), + ("32941/32941-0", "Japanese"), + ("24264/24264-0", "Chinese"), + ("40687/40687-0", "Telugu"), # Third most spoken in India, official + ("50513/50513-0", "French"), + ("43007/43007-0", "Arabic"), + ("38496/38496-8", "Portuguese"), # Latin1 + ("2000/2000-0", "Spanish"), # Don Quijote + ("48750/48750-8", "Swedish"), # Latin1 +# ("48322/48322-8", "German"), # Latin1 + ) + +const downloadedbooks = ( +# ("LYSAIa GORA DIeVICh'Ia - SIeRGIeI GOLOVAChIoV.txt", "Russian"), + ) + +getdefdir(dir)::String = dir === nothing ? homedir() : dir + +function find_beg(lines) + CSE = "Character set encoding: " + csebeg = sizeof(CSE) + 1 + last = length(lines) + cse = "" + ln = 0 + while ln < last + l = lines[ln += 1] + if starts_with(l, CSE) + cse = l[csebeg:end] + elseif sizeof(l) > 41 && starts_with(l, "***") && ends_with(l, "***") && + occurs_in(" PROJECT GUTENBERG EBOOK", l) && + occurs_in("START OF TH", l) + # Skip over empty lines + while ln < last && (l = lines[ln += 1]) == "" ; end + if starts_with(l, "Produced by ") + # Skip over non-empty lines after "Produced by" + while ln < last && lines[ln += 1] != "" ; end + # Skip over empty lines after "Produced by" + while ln < last && lines[ln += 1] == "" ; end + end + return (ln, cse) + end + end + (1, cse) +end + +function find_end(lines, beg) + ln = last = length(lines) + while ln > beg + l = lines[ln] + if sizeof(l) > 41 && starts_with(l, "***") && ends_with(l, "***") && + occurs_in(" PROJECT GUTENBERG EBOOK", l) && + occurs_in("END OF TH", l) + #print("Found END OF at line $ln") + while ln > beg && (l = lines[ln -= 1]) == "" ; end + #println(" => $ln") + if starts_with(l, "End of ") && occurs_in("Project Gutenberg", l) + #print("Found End of at line $ln") + while ln > beg && lines[ln -= 1] == "" ; end + #println(" => $ln") + end + return ln + end + ln -= 1 + end + last +end + +""" +Load books from Project Gutenberg site, removing lines added at beginning and end that +are not part of the book, as much as possible +""" +function load_gutenberg!(books, list, dict, gutenbergdir) + mkpath(gutenbergdir) + for (nam, lang) in list + cnt = get(dict, lang, 0) + dict[lang] = cnt + 1 + outnam = cnt == 0 ? "$lang.txt" : "$lang-$cnt.txt" + lname = joinpath(gutenbergdir, outnam) + download(string("http://www.gutenberg.org/files/", nam, ".txt"), lname) + println("Saved to: ", lname) + lines = readlines(lname) + (beg, cse) = find_beg(lines) + lst = find_end(lines, beg) + if cse == "ISO-8859-1" + filt = [convert(String, Str(LatinCSE, lines[ln])) for ln in beg:lst] + elseif cse == "UTF-8" || cse == "" + filt = lines[beg:lst] + else + error("Unknown character set encoding: $cse") + end + push!(books, (outnam, filt)) + end + books +end + +function remove_empty(lines) + # Eliminate empty lines + len = length(lines) + out = Vector{String}() + sizehint!(out, len) + for l in lines + is_empty(l) || push!(out, l) + end + out +end + +""" +load_books(; dir=nothing) + +Loads a set of books from a local directory, and downloads a set of books from Project Gutenberg +Returns them as a dictionary with names -> vectors of strings +""" +function load_books(; dir::Any=nothing) + defdir = getdefdir(dir) + inputdir = joinpath(defdir, inppath) + dict = Dict{String,Int}() + books = Vector{Tuple{String, Vector{String}}}() + for (nam, lang) in downloadedbooks + cnt = get(dict, lang, 0) + dict[lang] = cnt + 1 + outnam = cnt == 0 ? "$lang.txt" : "$lang-$cnt.txt" + push!(books, (outnam, readlines(joinpath(inputdir, nam)))) + end + load_gutenberg!(books, gutenbergbooks, dict, joinpath(defdir, gutpath)) +end + +""" +save_books(books; dir=nothing) + +Saves the collection of downloaded books into the given directory, in a "samples" subdirectory. +If the directory is not set, it will default to the user's home directory +""" +function save_books(books; dir::Any=nothing) + sampledir = joinpath(getdefdir(dir), smppath) + mkpath(sampledir) + for (nam, book) in books + outnam = joinpath(sampledir, nam) + open(outnam, "w") do io + for lin in book + println(io, lin) + end + println("Saved $nam") + end + end +end