
Commit df06c92

Merge pull request #1 from biobricks-ai/hdt-java-rdf2hdtcat
Add rdf2hdtcat.sh from hdt-java
2 parents d9eab20 + 7ba2b70 commit df06c92

1 file changed: +106 −0


bin/hdt-java-rdf2hdtcat.sh

#!/bin/bash

function showhelp {
    echo
    echo "Script to serialize a big RDF file in N-Triples format into HDT."
    echo "It splits the file into N parts, compresses each one with rdf2hdt, and merges them iteratively with hdtCat."
    echo
    echo "Usage: $0 [OPTION]"
    echo
    echo "  -c, --cat       location of the hdtCat script (assumed to be in PATH by default)"
    echo "  -i, --input     input file (input.rdf by default)"
    echo "  -h, --help      shows this help and exits"
    echo "  -n, --number    number of files to split the input into (2 by default)"
    echo "  -o, --output    output file (output.hdt by default)"
    echo "  -p, --parallel  number of threads used to serialize RDF into HDT in parallel (1 by default)"
    echo "  -r, --rdf2hdt   location of the rdf2hdt script (assumed to be in PATH by default)"
    echo
}

# Defaults
declare rdf2hdt="rdf2hdt.sh"
declare hdtCat="hdtCat.sh"
declare input="input.rdf"
declare -i lines
declare output="output.hdt"
declare -i splits=2
declare -i threads=1

getopt --test > /dev/null
if [[ $? -eq 4 ]]; then
    # enhanced getopt works
    OPTIONS=c:i:hn:o:p:r:
    LONGOPTIONS=cat:,input:,help,number:,output:,parallel:,rdf2hdt:
    COMMAND=$(getopt -o $OPTIONS -l $LONGOPTIONS -n "$0" -- "$@")
    if [[ $? -ne 0 ]]; then
        exit 2
    fi
    eval set -- "$COMMAND"
else
    echo "Enhanced getopt not supported. Brace yourself, this is not tested, but it should work :-)"
fi

# Parse command-line options
while true; do
    case "$1" in
        -c|--cat)
            hdtCat=$2
            shift 2
            ;;
        -i|--input)
            input=$2
            shift 2
            ;;
        -n|--number)
            splits=$2
            shift 2
            ;;
        -o|--output)
            output=$2
            shift 2
            ;;
        -p|--parallel)
            threads=$2
            shift 2
            ;;
        -r|--rdf2hdt)
            rdf2hdt=$2
            shift 2
            ;;
        --)
            shift
            break
            ;;
        *)
            showhelp
            exit 0
            ;;
    esac
done

total_lines=$(wc -l < "$input")
lines=$(( (total_lines + splits - 1) / splits )) # Set number of lines per split, rounding up

# Split the input into parts named "<input>_split_aa", "<input>_split_ab", ...
split -l $lines "$input" "$input"_split_

echo "***************************************************************"
echo "Serializing $splits files into HDT using $threads threads"
echo "***************************************************************"
# Serialize each split in parallel; each part becomes "<part>_<splits>.hdt"
echo -n "$input"_split_* | xargs -I{} -d' ' -P$threads $rdf2hdt -rdftype ntriples {} {}_"$splits".hdt

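# Illustration (assuming the default input name input.rdf and -n 4): the step above
# produces input.rdf_split_aa_4.hdt ... input.rdf_split_ad_4.hdt, and the loop below
# merges them pairwise into input.rdf_split_aa_ab_2.hdt and input.rdf_split_ac_ad_2.hdt,
# then into input.rdf_split_aa_ab_ac_ad_1.hdt, which is finally moved to the output file.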
# Merge the HDT files pairwise until a single "*_1.hdt" file remains
for (( i=$splits; i>1; i=i/2 )); do
    echo "***************************************************************"
    echo "Merging $i hdt files: " "$input"_split_*_"$i".hdt
    echo "***************************************************************"
    # In the command below, $1 and $2 are a pair of HDT files handed over by xargs,
    # and $0 is the next level ($i/2) used to name the merged file
    command='temp=${2%_*.hdt} ; '$hdtCat' $1 $2 ${1%_*.hdt}_${temp#*split_}_$0.hdt'
    echo -n "$input"_split_*_"$i".hdt | xargs -d' ' -n2 -P$threads bash -c "$command" $((i/2))
done

echo "***************************************************************"
echo "Moving output to '$output'"
echo "***************************************************************"
mv "$input"_split*_1.hdt "$output"

echo "***************************************************************"
echo "Cleaning up split files"
echo "***************************************************************"
rm "$input"_split_*
