-
Notifications
You must be signed in to change notification settings - Fork 1
/
phrase.py
66 lines (58 loc) · 1.44 KB
/
phrase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
import pandas as pd
import json
from collections import defaultdict
import nltk
from nltk.translate.phrase_based import phrase_extraction
from model1 import *
# Build a phrase translation table from an aligned English/French corpus and
# print the relative-frequency estimate of each French phrase given its
# English phrase, highest probability first.

# Parallel corpus: a JSON list whose entries carry an 'en' and an 'fr' sentence.
# The encoding is pinned to UTF-8 so accented French text does not depend on
# the platform's default codec.
with open('data1.json', encoding='utf-8') as f:
    data = json.load(f)

# Word alignments, indexed in step with the sentence pairs below.
# NOTE(review): `final_list` is not defined in this file; presumably it is
# provided by the star import from model1 -- confirm.
basicList = final_list

sentence_pair = [[entry['en'], entry['fr']] for entry in data]

# count[en_phrase][fr_phrase] -> number of times that phrase pair was extracted.
count = defaultdict(lambda: defaultdict(int))
for idx, (en_sentence, fr_sentence) in enumerate(sentence_pair):
    alignment = basicList[idx]
    # phrase_extraction yields 4-tuples; the last two fields are the
    # English-side and French-side phrase text (in that order).
    extracted = phrase_extraction(en_sentence, fr_sentence, alignment)
    for _, _, en_phrase, fr_phrase in extracted:
        count[en_phrase][fr_phrase] += 1

# Map "fr,en" -> count(en, fr) / sum over fr' of count(en, fr').
sortedDict = {}
for en_phrase, fr_counts in count.items():
    total = sum(fr_counts.values())
    for fr_phrase, num in fr_counts.items():
        sortedDict[fr_phrase + ',' + en_phrase] = num / total

# The raw alignments are echoed before the table (kept from the original
# script's output).
print(final_list)

# sorted() is stable, so entries with equal probability keep insertion order.
ranked = sorted(sortedDict.items(), key=lambda item: item[1], reverse=True)
for key, val in ranked:
    print(f"{key}: {val}")