# elasticbook.py
# (GitHub file-viewer residue removed here: navigation text and the
# line-number gutter 1-106. The script itself follows.)
# https://www.pg4e.com/code/elasticbook.py
# Download a book
# wget http://www.gutenberg.org/cache/epub/18866/pg18866.txt
# wget http://www.gutenberg.org/cache/epub/14091/pg14091.txt
# wget https://www.gutenberg.org/files/2591/2591-0.txt
# wget https://www.gutenberg.org/files/11/11-0.txt
# (If needed)
# https://www.pg4e.com/code/hidden-dist.py
# copy hidden-dist.py to hidden.py
# edit hidden.py and put in your credentials
# python3 elasticbook.py
from elasticsearch import Elasticsearch
from elasticsearch import RequestsHttpConnection
import time
import copy
import hidden
import uuid
import json
import hashlib
# Ask which book to load; an empty answer selects the default text
bookfile = input("Enter book file (i.e. pg18866.txt): ") or 'pg18866.txt'

# Index name derived from the file name: everything before the first dot
indexname = bookfile.partition('.')[0]

# Open the book now so a bad file name fails before any index is touched
fhand = open(bookfile)
# Load the secrets (host, credentials and URL details) from hidden.py
secrets = hidden.elastic()

# Collect the connection settings first, then build the client from them
hosts = [secrets['host']]
connection_options = {
    'http_auth': (secrets['user'], secrets['pass']),
    'url_prefix': secrets['prefix'],
    'scheme': secrets['scheme'],
    'port': secrets['port'],
    'connection_class': RequestsHttpConnection,
}
es = Elasticsearch(hosts, **connection_options)

# The index is named after the Elasticsearch user
indexname = secrets['user']

# Start fresh
# https://elasticsearch-py.readthedocs.io/en/master/api.html#indices
# ignore=[400, 404] is passed so these statuses (e.g. the index not
# existing yet) are not raised as errors by the client
res = es.indices.delete(index=indexname, ignore=[400, 404])
print("Dropped index", indexname)
print(res)

res = es.indices.create(index=indexname)
print("Created the index...")
print(res)
def _index_paragraph(offset, content):
    """Store one paragraph as a document in the index and report progress.

    offset is the 1-based paragraph number and content is the paragraph
    text.  Pauses for one second after every 100th paragraph.
    """
    doc = {
        'offset': offset,
        'content': content
    }

    # Alternatives for the primary key:
    #   the paragraph count:  pkey = offset
    #   a GUID:               pkey = uuid.uuid4()

    # Compute a SHA256 of the entire document as the primary key.
    # Because the pkey is based on the document contents,
    # the "index" is in effect INSERT ON CONFLICT UPDATE unless
    # the document contents change
    pkey = hashlib.sha256(json.dumps(doc).encode()).hexdigest()

    # NOTE(review): doc_type is the literal string "None", not the None
    # object -- confirm this is what the target server expects
    res = es.index(index=indexname, id=pkey, body=doc, doc_type="None")

    print('Added document', pkey)
    # print(res['result'])

    if offset % 100 == 0:
        print(offset, 'loaded...')
        time.sleep(1)


para = ''    # text of the paragraph currently being collected
chars = 0    # characters seen, counted after stripping each line
count = 0    # lines read
pcount = 0   # paragraphs indexed

# 'with' closes the book file once every line has been read
with fhand:
    for line in fhand:
        count = count + 1
        line = line.strip()
        chars = chars + len(line)

        if line != '':
            # A paragraph is its stripped lines, each preceded by a space
            para = para + ' ' + line
            continue

        # A blank line ends the current paragraph; extra blank lines
        # between paragraphs are skipped
        if para == '':
            continue
        pcount = pcount + 1
        _index_paragraph(pcount, para)
        para = ''

# A file that does not end with a blank line still has its last
# paragraph pending here; index it so it is not silently dropped
if para != '':
    pcount = pcount + 1
    _index_paragraph(pcount, para)
    para = ''

# Tell it to recompute the index
res = es.indices.refresh(index=indexname)
print("Index refreshed", indexname)
print(res)

print(' ')
print('Loaded', pcount, 'paragraphs', count, 'lines', chars, 'characters')