|
| 1 | +#!/bin/bash |
| 2 | + |
# Prints the usage summary of this script to stdout.
# Takes no arguments; $0 is interpolated as the program name. The function
# only prints: choosing an exit status is left to the caller.
function showhelp {
  # Unquoted delimiter on purpose so that $0 is expanded inside the text.
  # The long name of -c is --cat, which is the name the option parser accepts.
  cat <<EOF

Script to serialize a big RDF file in n-triples format into HDT
It splits the file in N parts, compress each one with rdf2hdt, and merges them iteratively with hdtCat.

Usage $0 [OPTION]

 -c, --cat location of hdtCat script (assuming it's in PATH by default)
 -i, --input input file (input.rdf by default)
 -h, --help shows this help and exits
 -n, --number number of files to split FILE (2 by default)
 -o, --output output file (output.hdt by default)
 -p, --parallel number of threads to serialize RDF into HDT in parallel (1 by default)
 -r, --rdf2hdt location of rdf2hdt script (assuming it's in PATH by default)

EOF
}
| 19 | + |
# Defaults (each one can be overridden from the command line)
declare rdf2hdt="rdf2hdt.sh" # converter command, set with -r/--rdf2hdt
declare hdtCat="hdtCat.sh"   # merge command, set with -c/--cat
declare input="input.rdf"    # file to convert, set with -i/--input
declare -i lines             # lines per split part, computed after parsing
declare output="output.hdt"  # final file, set with -o/--output
declare -i splits=2          # number of parts, set with -n/--number
declare -i threads=1         # parallel jobs, set with -p/--parallel

#######################################
# Parses the command line into the global settings declared above.
# Globals:   hdtCat, input, splits, output, threads, rdf2hdt (written)
# Arguments: the script's own arguments ("$@")
# Outputs:   diagnostics on stderr; the help text on stdout for -h/--help
# Returns:   0 when parsing succeeds. Exits the script with 0 after
#            printing help and with 2 on any usage error.
#######################################
function parse_args {
  local getopt_status=0
  local normalized
  local -r positive_re='^[1-9][0-9]*$'
  local -r non_negative_re='^(0|[1-9][0-9]*)$'

  # "getopt --test" exits with status 4 only for the enhanced implementation.
  getopt --test > /dev/null 2>&1 || getopt_status=$?
  if (( getopt_status == 4 )); then
    # enhanced getopt works: let it validate and normalise the options
    normalized=$(getopt -o 'c:i:hn:o:p:r:' \
      -l 'cat:,input:,help,number:,output:,parallel:,rdf2hdt:' \
      -n "$0" -- "$@") || exit 2
    eval set -- "$normalized"
  else
    echo "Enhanced getopt not supported. Brace yourself, this is not tested, but it should work :-)" >&2
  fi

  # Without enhanced getopt there is no trailing "--", so the loop must also
  # end when the arguments run out instead of treating that as a help request.
  while (( $# > 0 )); do
    case "$1" in
      -h|--help)
        showhelp
        exit 0
        ;;
      --)
        shift
        break
        ;;
      -c|--cat|-i|--input|-n|--number|-o|--output|-p|--parallel|-r|--rdf2hdt)
        # "shift 2" fails without shifting when the value is missing, which
        # would spin this loop forever; reject the option up front instead.
        if (( $# < 2 )); then
          echo "$0: option '$1' requires an argument" >&2
          exit 2
        fi
        case "$1" in
          -c|--cat)
            hdtCat=$2
            ;;
          -i|--input)
            input=$2
            ;;
          -n|--number)
            # splits is later used as a divisor, so it must be at least 1
            if [[ ! $2 =~ $positive_re ]]; then
              echo "$0: option '$1' expects a positive integer, got '$2'" >&2
              exit 2
            fi
            splits=$2
            ;;
          -o|--output)
            output=$2
            ;;
          -p|--parallel)
            if [[ ! $2 =~ $non_negative_re ]]; then
              echo "$0: option '$1' expects a non-negative integer, got '$2'" >&2
              exit 2
            fi
            threads=$2
            ;;
          -r|--rdf2hdt)
            rdf2hdt=$2
            ;;
        esac
        shift 2
        ;;
      *)
        echo "$0: unrecognized argument '$1'" >&2
        showhelp >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
| 79 | + |
# ---------------------------------------------------------------------------
# Split the input, convert every part to HDT in parallel, merge the HDT files
# pairwise until a single one is left, move it to $output and remove the parts.
# Reads the settings: input, output, splits, threads, rdf2hdt, hdtCat.
# Every intermediate file is named "<input>_split_<suffix>[_<round>.hdt]".
# ---------------------------------------------------------------------------

# Prints an error on stderr and aborts the script. Intermediate files are
# deliberately left in place so a failed run can be inspected.
function fail {
  printf '%s: %s\n' "$0" "$*" >&2
  exit 1
}

[[ -r "$input" ]] || fail "cannot read input file '$input'"
[[ -s "$input" ]] || fail "input file '$input' is empty"
(( splits >= 1 )) || fail "the number of splits must be at least 1 (got '$splits')"

# The globs below would also pick up files left behind by an earlier run and
# feed them to rdf2hdt/hdtCat, so refuse to start while any of them exist.
leftovers=("${input}_split_"*)
if [[ -e ${leftovers[0]} ]]; then
  fail "found existing '${input}_split_*' files; remove them and run again"
fi

total_lines=$(wc -l < "$input") || fail "cannot count the lines of '$input'"
lines=$(( (total_lines + splits - 1) / splits )) # Set number of lines rounding up
# wc -l reports 0 for a single line without a trailing newline, and split
# rejects a line count of 0.
(( lines > 0 )) || lines=1

split -l "$lines" -- "$input" "${input}_split_" || fail "could not split '$input'"
parts=("${input}_split_"*)
[[ -e ${parts[0]} ]] || fail "split produced no files for '$input'"

echo "***************************************************************"
echo "Serializing into HDT ${#parts[@]} files using $threads threads"
echo "***************************************************************"
# NUL-delimited names keep paths containing spaces intact. Each part P is
# converted into "P_<splits>.hdt"; the numeric tag identifies the merge round.
printf '%s\0' "${parts[@]}" \
  | xargs -0 -I{} -P "$threads" "$rdf2hdt" -rdftype ntriples {} "{}_${splits}.hdt" \
  || fail "'$rdf2hdt' failed for at least one part"

# Executed by xargs through "bash -c" with the arguments:
#   $1 = merge command, $2 = tag of the next round, $3 = first file,
#   $4 = second file (absent for the odd file at the end of a round).
# The result keeps the stem of the first file, so names do not grow from round
# to round. An odd file is renamed into the next round rather than dropped.
merge_step='
  merged="${3%_*.hdt}_${2}.hdt"
  if [ "$#" -ge 4 ]; then
    exec "$1" "$3" "$4" "$merged"
  fi
  exec mv -- "$3" "$merged"
'

# The tag is halved rounding UP: a round over n files leaves ceil(n/2) files,
# so the loop reaches tag 1 exactly when at most one file remains, for any
# number of splits and not only for powers of two.
for (( i = splits; i > 1; i = (i + 1) / 2 )); do
  next_tag=$(( (i + 1) / 2 ))
  round=("${input}_split_"*"_${i}.hdt")
  [[ -e ${round[0]} ]] || fail "no HDT files found for merge round $i"
  echo "***************************************************************"
  echo "Merging ${#round[@]} hdt files: " "${round[@]}"
  echo "***************************************************************"
  printf '%s\0' "${round[@]}" \
    | xargs -0 -n 2 -P "$threads" bash -c "$merge_step" merge_step "$hdtCat" "$next_tag" \
    || fail "'$hdtCat' failed while merging round $i"
done

result=("${input}_split_"*"_1.hdt")
if (( ${#result[@]} != 1 )) || [[ ! -e ${result[0]} ]]; then
  fail "expected exactly one merged HDT file matching '${input}_split_*_1.hdt'"
fi

echo "***************************************************************"
echo "Moving output to '$output' file"
echo "***************************************************************"
mv -- "${result[0]}" "$output" || fail "could not move '${result[0]}' to '$output'"

echo "***************************************************************"
echo "Cleaning up split files"
echo "***************************************************************"
rm -- "${input}_split_"*
0 commit comments