-
Notifications
You must be signed in to change notification settings - Fork 0
/
documentSimilarity.py
58 lines (46 loc) · 2.99 KB
/
documentSimilarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Punctuation characters removed from every token before comparison.
punctuations = {".", ",", "!", "'", "?", ";", ":", '"', "(", ")"}

# English stopwords that carry little meaning and are dropped from sentences.
stopwords = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "you'll", "he", "him", "his",
    "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they",
    "them", "their", "theirs", "themselves", "what", "which", "who", "whom",
    "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
    "been", "being", "have", "has", "had", "having", "do", "does", "did",
    "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about", "against",
    "between", "into", "through", "during", "before", "after", "above",
    "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
    "under", "again", "further", "then", "once", "here", "there", "when",
    "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
    "same", "so", "than", "too", "very", "s", "t", "can", "will", "just",
    "don", "should", "now",
}


def filter_sentences(s):
    """Return the set of meaningful lowercase words found in a sentence.

    The sentence is split on whitespace, each token is lowercased, the
    characters in ``punctuations`` are removed, and tokens listed in
    ``stopwords`` are discarded.

    Args:
        s: The raw sentence text.

    Returns:
        A set of cleaned tokens. It is empty when the sentence holds no
        words other than stopwords and punctuation.
    """
    edge_chars = "".join(punctuations)
    new_sent = set()
    # split() with no argument collapses runs of whitespace, so repeated
    # spaces or an empty sentence never produce empty-string tokens.
    for word in s.split():
        lowered = word.lower()
        # Some stopwords (e.g. "you'll") contain an apostrophe, so test the
        # token with only its surrounding punctuation trimmed before every
        # punctuation character is removed.
        if lowered.strip(edge_chars) in stopwords:
            continue
        cleaned = "".join(ch for ch in lowered if ch not in punctuations)
        # A token made only of punctuation collapses to "" and is skipped.
        if cleaned and cleaned not in stopwords:
            new_sent.add(cleaned)
    return new_sent
def check_similarity(sentence1, sentence2):
    """Return the Jaccard similarity between two sets of tokens.

    The score is the size of the intersection divided by the size of the
    union, so it ranges from 0.0 (nothing shared) to 1.0 (identical sets).

    Args:
        sentence1: Set of tokens from the first sentence.
        sentence2: Set of tokens from the second sentence.

    Returns:
        The similarity score as a float. When both sets are empty there is
        nothing to compare, so 0.0 is returned instead of dividing by zero.
    """
    sentence_union = sentence1.union(sentence2)
    if not sentence_union:
        return 0.0
    sentence_intersection = sentence1.intersection(sentence2)
    return len(sentence_intersection) / len(sentence_union)
def describe_comparison(score1, score2):
    """Return the verdict sentence for two similarity scores.

    Args:
        score1: Similarity of the first and second sentence.
        score2: Similarity of the first and third sentence.

    Returns:
        A message naming the more similar pair, or stating that both pairs
        are equally similar when the scores are the same.
    """
    if score1 > score2:
        return "The first and second sentence are more similar than first and third sentence."
    if score1 < score2:
        return "The first and third sentence are more similar than first and second sentence."
    # A tie must not be reported as a win for either pair.
    return "The first sentence is equally similar to the second and third sentence."


if __name__ == "__main__":
    # Read the three sentences; the first one is compared with the other two.
    s1 = input("Enter first sentence: \n")
    print()
    s2 = input("Enter second sentence: \n")
    print()
    s3 = input("Enter third sentence: \n")
    print()

    # Reduce each sentence to its set of meaningful words.
    parsed_sentence1 = filter_sentences(s1)
    parsed_sentence2 = filter_sentences(s2)
    parsed_sentence3 = filter_sentences(s3)

    score1 = check_similarity(parsed_sentence1, parsed_sentence2)
    score2 = check_similarity(parsed_sentence1, parsed_sentence3)

    print("Similarity score for first & second sentence: ", score1)
    print("Similarity score for first & third sentence: ", score2)
    print()
    print(describe_comparison(score1, score2))