-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstatistics.py
45 lines (31 loc) · 1.34 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import numpy as np
import pandas as pd
import helper_functions as hpf
def count_words(text: str) -> int:
    """Return the number of whitespace-separated words in ``text``."""
    # str.split() with no arguments splits on runs of whitespace and drops
    # empty strings, so blank or empty text yields zero words.
    words = text.split()
    return len(words)
def calculate_ioc(input_array: np.ndarray, alphabet_size: int = 29) -> float:
    """Return the normalised index of coincidence of ``input_array``.

    Computes ``sum(c * (c - 1)) / (N * (N - 1) / alphabet_size)`` where each
    ``c`` is the occurrence count of one distinct value in ``input_array``
    and ``N`` is ``len(input_array)``.

    Args:
        input_array: Array of symbols; anything ``np.unique`` can count.
        alphabet_size: Normalisation factor. Defaults to 29, the value that
            was previously hard-coded (presumably the number of symbols in
            the alphabet -- confirm against the callers).

    Returns:
        The normalised index of coincidence, or ``0.0`` when the input holds
        fewer than two elements (no pair of symbols can be formed).
    """
    total = len(input_array)
    if total < 2:
        # N * (N - 1) is zero here; dividing by it used to raise
        # ZeroDivisionError (empty input) or produce NaN (one element).
        return 0.0

    _, counts = np.unique(input_array, return_counts=True)
    # Vectorised form of: for c in counts: ioc += c * (c - 1)
    coincidences = int(np.sum(counts * (counts - 1)))
    return coincidences / (total * (total - 1) / alphabet_size)
def count_doublets(input_array) -> tuple[int, float]:
    """Count positions where an element equals its immediate successor.

    Args:
        input_array: Sequence of symbols. It is converted with
            ``np.asarray`` so that plain lists are compared element-wise
            (slicing two lists and comparing them with ``==`` would yield a
            single bool instead of a per-position result).

    Returns:
        A tuple ``(number_of_doublets, percentage)`` where ``percentage`` is
        the doublet count relative to the ``len(input_array) - 1`` adjacent
        pairs, times 100. Inputs with fewer than two elements have no
        adjacent pairs and return ``(0, 0.0)``.
    """
    symbols = np.asarray(input_array)
    pair_count = len(symbols) - 1
    if pair_count < 1:
        # Avoids dividing by zero (one element) or by -1 (empty input).
        return 0, 0.0

    # Compare every element with its right-hand neighbour.
    number_of_doublets = int(np.sum(symbols[:-1] == symbols[1:]))
    return number_of_doublets, number_of_doublets / pair_count * 100
def gp_sums(input_text) -> list[int]:
    """Return ``hpf.gp_sum(word)`` for each whitespace-separated word.

    The results are returned in the order the words appear in
    ``input_text``; text without any words yields an empty list.
    """
    return [hpf.gp_sum(word) for word in input_text.split()]
def calculate_quadgram_score(
    input_text: np.ndarray,
    probabilities: "np.ndarray | None" = None,
    alphabet_size: int = 29,
) -> float:
    """Sum the score-table entries of every 4-symbol window of ``input_text``.

    Each window ``(a, b, c, d)`` is mapped to the flat table index
    ``a * alphabet_size**3 + b * alphabet_size**2 + c * alphabet_size + d``.
    With the default ``alphabet_size`` of 29 these are the multipliers
    24389, 841 and 29 that were previously hard-coded.

    Args:
        input_text: 1-D array of integer symbol indices. Values are assumed
            to lie in ``[0, alphabet_size)``; this is not validated here.
        probabilities: Optional pre-loaded flat score table indexed as
            described above. When omitted, the table is read from
            "new_quadgrams.txt" via ``read_data_from_file`` on every call
            (the previous behaviour). Callers scoring many texts can load
            the table once and pass it in.
        alphabet_size: Base used to build the flat index; defaults to 29.

    Returns:
        The summed table entries, or ``0.0`` when ``input_text`` has fewer
        than four symbols and therefore contains no quadgram.
    """
    symbols = np.asarray(input_text)
    if len(symbols) < 4:
        # No complete window. A length-2 input used to build a ragged index
        # array and fail; other short inputs already summed to zero.
        return 0.0

    if np.issubdtype(symbols.dtype, np.integer):
        # Widen narrow dtypes (e.g. uint8) so multiplying by
        # alphabet_size**3 cannot overflow or be rejected.
        symbols = symbols.astype(np.int64)

    if probabilities is None:
        probabilities = read_data_from_file("new_quadgrams.txt")

    base = int(alphabet_size)
    indices = (
        symbols[:-3] * base ** 3
        + symbols[1:-2] * base ** 2
        + symbols[2:-1] * base
        + symbols[3:]
    )
    return float(np.sum(probabilities[indices]))
def read_data_from_file(file_name: str) -> np.ndarray:
    """Load column 4 of a header-less, comma-separated file as a flat array.

    Args:
        file_name: Path of the file to read. Previously this argument was
            ignored and "new_quadgrams.txt" was always opened.

    Returns:
        A 1-D array holding the values of the fifth column (index 4), one
        entry per row of the file, in file order.
    """
    df = pd.read_csv(file_name, sep=',', header=None, usecols=[4])
    return df.to_numpy().flatten()