-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathextract_relations_between_entities.py
57 lines (47 loc) · 2.29 KB
/
extract_relations_between_entities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import stanza
import chardet
from stanza.server import CoreNLPClient
from stanza.server.client import PermanentlyFailedException
# Fetch the Stanford CoreNLP package and the English KBP models (version 4.2.0).
stanza.install_corenlp()
stanza.download_corenlp_models(model='english-kbp', version='4.2.0')
# Build the CoreNLP client; the annotator pipeline is listed in execution
# order and ends with the kbp processor.
client = CoreNLPClient(
    annotators=["tokenize", "pos", "lemma", "depparse", "sentiment", "ner", "kbp"],
    kbp_model_name='english-kbp',
    timeout=600000,
    memory='6G',
)
# Make sure the output directory exists before any result file is written.
os.makedirs("C:\\python\\autoindex\\relation_extraction\\", exist_ok=True)
# Annotate every file in the input directory and write the relations found in
# each one to "<name>_relations.txt" in the output directory.
INPUT_DIR = "C:\\python\\autoindex\\txt_output"
OUTPUT_DIR = "C:\\python\\autoindex\\relation_extraction"

for filename in os.listdir(INPUT_DIR):
    source_path = os.path.join(INPUT_DIR, filename)
    # os.listdir also returns sub-directories, which cannot be opened as files.
    if not os.path.isfile(source_path):
        continue

    # Detect the file encoding from the raw bytes.
    with open(source_path, "rb") as f:
        file_bytes = f.read()
    # chardet reports an encoding of None when it cannot decide (e.g. for an
    # empty file); fall back to UTF-8 instead of the platform's locale default.
    encoding = chardet.detect(file_bytes)["encoding"] or "utf-8"

    # Read the file as text using the detected encoding; undecodable bytes are
    # replaced rather than aborting the whole run.
    with open(source_path, "r", encoding=encoding, errors="replace") as f:
        document = f.read()

    # Try to annotate the document with the client.
    try:
        doc = client.annotate(document)
    except PermanentlyFailedException as e:
        # Only a port conflict is worth retrying; re-raise anything else with
        # its original traceback.
        if "unable to start the CoreNLP server on port" not in str(e):
            raise
        print("Trying to stop the previous server and start a new one...")
        client.stop()
        client.start()
        doc = client.annotate(document)

    # Collect the relations of every sentence in the document.
    # NOTE(review): the pipeline runs the kbp annotator; its triples may be
    # exposed as `sentence.kbpTriple` rather than `sentence.relation` - confirm
    # against the CoreNLP protobuf schema if the output files come out empty.
    relations = []
    for sentence in doc.sentence:
        relations.extend(sentence.relation)

    # Name the output after the input file without its extension. A separate
    # variable is used so the loop variable is not rebound mid-iteration.
    stem, _extension = os.path.splitext(filename)
    output_path = os.path.join(OUTPUT_DIR, f"{stem}_relations.txt")
    # Explicit encoding so the output does not depend on the locale code page.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{relation}\n" for relation in relations)
# Shut down the CoreNLP server when done. `stop()` is the same shutdown call
# this script already uses when it restarts the server after a port conflict.
client.stop()