-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocessing.py
49 lines (37 loc) · 1.66 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import re
from itertools import islice
def chunk_by_paragraph(text: str):
    """Split *text* into paragraphs at runs of newline characters.

    Leading and trailing newlines are ignored, and pieces that contain only
    whitespace are dropped. Every returned paragraph ends with exactly one
    newline character.

    Args:
        text: The raw text to split.

    Returns:
        A dict mapping each paragraph's 0-based position to its text.
        Empty (or whitespace-only) input yields an empty dict.
    """
    trimmed = text.strip("\n")
    # The quantifier must not contain a space: `re` parses "{1, }" as the
    # literal characters "{1, }" instead of "one or more", which would mean
    # the text is never split on newlines at all.
    pieces = re.split(r"\n+", trimmed)
    # Re-append a newline so every paragraph is newline-terminated; skip
    # pieces that hold nothing but whitespace.
    paragraphs = [piece + "\n" for piece in pieces if piece.strip()]
    return dict(enumerate(paragraphs))
def chunk_by_sentence_and_len(text: str, threshold: int = 200):
    """Split *text* into sentences and greedily merge short neighbours.

    The text is split on the separator ". ". Each piece is merged into the
    previous chunk (joined by a single space) while the piece's length plus
    the current chunk's length stays below *threshold*; otherwise it starts
    a new chunk.

    Args:
        text: The raw text to split.
        threshold: Merge limit, in characters, used by the comparison above.

    Returns:
        A dict mapping each chunk's 0-based position to its text. Empty
        input yields an empty dict.
    """
    pieces = text.split('. ')
    last_index = len(pieces) - 1
    out = []
    for index, piece in enumerate(pieces):
        if index < last_index:
            # split() consumed this sentence's ". " separator, so put the
            # period back.
            sentence = piece + '.'
        elif piece:
            # The final piece never lost a separator; keep its own ending
            # instead of appending a second period.
            sentence = piece
        else:
            # Text was empty or ended with ". ": nothing left to emit.
            continue
        if out and len(piece) + len(out[-1]) < threshold:
            out[-1] += ' ' + sentence
        else:
            out.append(sentence)
    return dict(enumerate(out))
def chunk_by_overlap_windows(text: str, threshold: int = 200):
    """Return every window of *threshold* consecutive characters in *text*.

    Windows advance one character at a time, so consecutive windows overlap
    in all but one character.

    Args:
        text: The raw text to slice.
        threshold: Window width in characters; must be at least 1.

    Returns:
        A dict mapping each window's 0-based start offset to the window
        text. If *text* is shorter than *threshold* there is no full window
        and the result is an empty dict.

    Raises:
        ValueError: If *threshold* is smaller than 1.
    """
    if threshold < 1:
        raise ValueError("threshold must be a positive integer")
    # Number of full-width windows; zero or negative when the text is too
    # short, in which case range() below is simply empty.
    window_count = len(text) - threshold + 1
    windows = [text[start:start + threshold] for start in range(window_count)]
    return dict(enumerate(windows))
def chunk_by_sentance(text: str):
    """Split *text* at every period and number the resulting pieces.

    The periods themselves are not kept, surrounding whitespace is left
    untouched, and text ending in a period produces a final empty piece.

    Returns:
        A dict mapping each piece's 0-based position to its text.
    """
    return dict(enumerate(text.split('.')))
# Short example passage for trying out the chunking functions by hand.
sample_text = (
    'Albert Einstein was a German-born theoretical physicist who developed the theory of relativity, '
    'one of the two pillars of modern physics. '
    'His work is also known for its influence on the philosophy of science.'
)
# sample_text = chunk_by_sentence_and_len(sample_text)
# print(sample_text)