diff --git a/article_to_csv.py b/article_to_csv.py
deleted file mode 100644
index f415e65..0000000
--- a/article_to_csv.py
+++ /dev/null
@@ -1,98 +0,0 @@
-## Carter Merenstein
-## Middlebury College
-''' Reads in text files downloaded from factiva, filters out common words
-stems words, and then outputs a csv of article, date, and frequency of words.'''
-
-
-import csv
-import os
-import re
-from nltk import PorterStemmer
-
-pattern = re.compile('\W|\d') # not an alphanumeric
-
-articles = {} #articles with words
-article_dates = {}
-all_words = {}
-article_sources = {} #document number corresponds to source
-
-txt_paths = "C:\\CDocs\\Annas_thesis\\txtgroups"
-
-def isNewArticle(line_arr):
-    ''' the easiest identifier for a new article is that they start with a wordcount, e.g. "999 words"
-    takes in a line already split into a list'''
-    try:
-        condition2 = (line_arr[1].strip('\n') == 'words') ## needs to be separate
-    except:
-        return False # an error occurs if there is < 2 words. In this case we know it's not a new article
-    return (re.match('[0-9]', line_arr[0]) != None and condition2)
-
-for txt in os.listdir(txt_paths):
-    started = False #can't classify things before the title
-    new_article = False #used to pull name, source, date from articles
-    filename = txt_paths + '\\' + txt
-    with open(filename, 'r', encoding = "utf8") as group:
-        i = 0
-        article_title = '' #separate out by article
-        two_lines_ago = '' #always hold this to be able to snatch the title
-        last_line = ''
-        date = '' #always after wordcount
-        source = txt.strip('.txt') # just a number, Anna can sort later
-        for line in group:
-            line_arr = line.split(' ')
-            if isNewArticle(line_arr):
-                article_title = two_lines_ago
-                articles[article_title] = {}
-                i += 1
-                started = True # so we don't get an error on the first few lines
-                new_article = True
-            elif started: #just if we're past the first 2 lines of the document
-                for word in line_arr:
-                    if new_article:
-                        date = line
-                        article_dates[article_title] = date
-                        new_article = False
-                        article_sources[article_title] = source
-                    else:
-                        word = re.sub(pattern, '', word) #get rid of commas and stuff
-                        word = word.lower()
-                        word = PorterStemmer().stem_word(word) #get root of word
-                        try:
-                            articles[article_title][word] += 1
-                            all_words[word] += 1
-                        except:
-                            articles[article_title][word] = 1
-                            try:
-                                all_words[word] += 1
-                            except:
-                                all_words[word] = 0
-            two_lines_ago = last_line
-            last_line = line
-
-
-with open('stemmedWordFreq.csv ', 'w', newline='', encoding="utf8") as out:
-    w = csv.writer(out)
-    article_names = []
-    header = ['']
-    for article in articles.keys():
-        article_names.append(article)
-        header.append(article)
-    w.writerow(header)
-    article_dates_row = [''] #another row for the dates
-    for article in article_names:
-        article_dates_row.append(article_dates[article]) #put the date in the row below the article
-    w.writerow(article_dates_row)
-    article_source_row = ['']
-    for article in article_names:
-        article_source_row.append(article_sources[article])
-    w.writerow(article_source_row)
-    for word in all_words.keys():
-        if all_words[word] >= 100:
-            line = [word]
-            for article in article_names:
-                try:
-                    line.append(articles[article][word])
-
-                except KeyError:
-                    line.append(0)
-w.writerow(line)
\ No newline at end of file
diff --git a/main.py b/main.py
index 3d5720e..b2a015c 100644
--- a/main.py
+++ b/main.py
@@ -152,10 +152,12 @@ def main(self, path):
 
                     continue
                 else:
+                    """
                     degrees = self.get_rotation_info(image_file_name)
                     if degrees:
                         self.fix_dpi_and_rotation(image_file_name,
                                                   degrees, ext)
+                    """
                     call(["tesseract", image_file_name, text_file_path],
                          stdout=FNULL)
                     #Fetch tesseract with FNULL in write mode
diff --git a/ocr.py b/ocr.py
deleted file mode 100644
index d6f8819..0000000
--- a/ocr.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# import the necessary packages
-import argparse
-import os
-
-import pytesseract
-from PIL import Image
-
-import cv2
-
-# construct the argument parse and parse the arguments
-ap = argparse.ArgumentParser()
-ap.add_argument("-i", "--image", required=True,
-    help="path to input image to be OCR'd")
-ap.add_argument("-p", "--preprocess", type=str, default="thresh",
-    help="type of preprocessing to be done")
-args = vars(ap.parse_args())
-
-# load the example image and convert it to grayscale
-image = cv2.imread(args["image"])
-gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-
-# check to see if we should apply thresholding to preprocess the
-# image
-if args["preprocess"] == "thresh":
-    gray = cv2.threshold(gray, 0, 255,
-        cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
-
-# make a check to see if median blurring should be done to remove
-# noise
-elif args["preprocess"] == "blur":
-    gray = cv2.medianBlur(gray, 3)
-
-# write the grayscale image to disk as a temporary file so we can
-# apply OCR to it
-filename = "{}.png".format(os.getpid())
-cv2.imwrite(filename, gray)
-
-# load the image as a PIL/Pillow image, apply OCR, and then delete
-# the temporary file
-text = pytesseract.image_to_string(Image.open(filename))
-os.remove(filename)
-print(text)
-
-# show the output images
-cv2.imshow("Image", image)
-cv2.imshow("Output", gray)
-cv2.waitKey(0)
diff --git a/rotation.py b/rotation.py
deleted file mode 100644
index c721ad1..0000000
--- a/rotation.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-import subprocess
-import PIL.Image as Image
-
-from glob import glob
-
-command = 'c:\\Share\\tesseract.exe'
-image = '337.jpg'
-DPI = 300
-arguments = ' %s - -psm 0'
-
-
-def get_rotation_info(filename):
-    stdoutdata = subprocess.getoutput(command + arguments % filename)
-    degrees = None
-    for line in stdoutdata.splitlines():
-        info = 'Orientation in degrees: '
-        if info in line:
-            degrees = -float(line.replace(info, '').strip())
-            #print("Found rotation: %.2f" % degrees)
-    return degrees
-
-def fix_dpi_and_rotation(filename, degrees, dpi_info):
-    im1 = Image.open(filename)
-    print('Fixing rotation %.2f in %s...' % (degrees, filename))
-    im1.rotate(degrees).save('../%s' % filename,
-                             'JPEG', quality=97, dpi = (dpi_info, dpi_info))
-
-filenames = sorted(glob('*.jpg'))
-for filename in filenames:
-    print('Checking %s...' % filename)
-    degrees = get_rotation_info(filename)
-    if degrees:
-        fix_dpi_and_rotation(filename, degrees, DPI)
\ No newline at end of file
diff --git a/similarity.py b/similarity.py
deleted file mode 100644
index 7814da9..0000000
--- a/similarity.py
+++ /dev/null
@@ -1,169 +0,0 @@
-from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.corpus import stopwords,wordnet
-from nltk.stem import WordNetLemmatizer
-from itertools import product
-import numpy
-
-# str1 = "Abhishek is a good boy."
-# str2 = "Abhishek is not a bad boy."
-# str1 = "Cat is drinking water."
-# str2 = "Lions eat flesh."
-# str1 = "He loves to play football."
-# str2 = "Football is his favourite sport."
-# str1 = "Many consider Maradona as the best player in soccer history."
-# str2 = "Maradona is one of the best soccer player."
-
-str1 = "I was given a card by her in the garden."
-str2 = "In the garden, she gave me a card."
-
-# str1 = "Ballmer has been vocal in the past warning that Linux is a threat to Microsoft."
-# str2 = "In the memo, Ballmer reiterated the open-source threat to Microsoft."
-# str1 = "The boy is fetching water from the well."
-# str2 = "The lion is running in the forest."
-# str1 = "A school is a place where kids go to study."
-# str2 = "School is an institution for children who want to study."
-# str1 = "The world knows it has lost a heroic champion of justice and freedom."
-# str2 = "The earth recognizes the loss of a valiant champion of independence and justice."
-# str1 = "A cemetery is a place where dead people's bodies or their ashes are buried."
-# str2 = "A graveyard is an area of land ,sometimes near a church, where dead people are buried."
-
-##---------------Defining stopwords for English Language---------------##
-stop_words = set(stopwords.words("english"))
-
-##---------------Initialising Lists---------------##
-filtered_sentence1 = []
-filtered_sentence2 = []
-lemm_sentence1 = []
-lemm_sentence2 = []
-sims = []
-temp1 = []
-temp2 = []
-simi = []
-final = []
-same_sent1 = []
-same_sent2 = []
-#ps = PorterStemmer()
-
-##---------------Defining WordNet Lematizer for English Language---------------##
-lemmatizer = WordNetLemmatizer()
-
-#myfile = open('Text1.txt', 'r')
-#data=myfile.read().replace('\n', '')
-##print(sent_tokenize(example_text))
-##
-##print(word_tokenize(example_text))
-
-##---------------Tokenizing and removing the Stopwords---------------##
-
-for words1 in word_tokenize(str1):
-    if words1 not in stop_words:
-        if words1.isalnum():
-            filtered_sentence1.append(words1)
-
-##---------------Lemmatizing: Root Words---------------##
-
-for i in filtered_sentence1:
-    lemm_sentence1.append(lemmatizer.lemmatize(i))
-
-#print(lemm_sentence1)
-
-
-##---------------Tokenizing and removing the Stopwords---------------##
-
-for words2 in word_tokenize(str2):
-    if words2 not in stop_words:
-        if words2.isalnum():
-            filtered_sentence2.append(words2)
-
-##---------------Lemmatizing: Root Words---------------##
-
-for i in filtered_sentence2:
-    lemm_sentence2.append(lemmatizer.lemmatize(i))
-
-#print(lemm_sentence2)
-
-##---------------Removing the same words from the tokens----------------##
-##for word1 in lemm_sentence1:
-##    for word2 in lemm_sentence2:
-##        if word1 == word2:
-##            same_sent1.append(word1)
-##            same_sent2.append(word2)
-##
-##if same_sent1 != []:
-##    for word1 in same_sent1:
-##        lemm_sentence1.remove(word1)
-##if same_sent2 != []:
-##    for word2 in same_sent2:
-##        lemm_sentence2.remove(word2)
-##
-##print(lemm_sentence1)
-##print(lemm_sentence2)
-
-##---------------Similarity index calculation for each word---------------##
-for word1 in lemm_sentence1:
-    simi =[]
-    for word2 in lemm_sentence2:
-        sims = []
-        # print(word1)
-        #print(word2)
-        syns1 = wordnet.synsets(word1)
-        #print(syns1)
-        #print(wordFromList1[0])
-        syns2 = wordnet.synsets(word2)
-        #print(wordFromList2[0])
-        for sense1, sense2 in product(syns1, syns2):
-            d = wordnet.wup_similarity(sense1, sense2)
-            if d != None:
-                sims.append(d)
-
-        #print(sims)
-        #print(max(sims))
-        if sims != []:
-            max_sim = max(sims)
-            #print(max_sim)
-            simi.append(max_sim)
-
-    if simi != []:
-        max_final = max(simi)
-        final.append(max_final)
-
-#print(final)
-
-# if max_sim >= 0.7:
-#     print(word1)
-#     print(word2)
-#     print('\n')
-
-# if word1 not in temp1:
-#     temp1.append(word1)
-# if word2 not in temp2:
-#     temp2.append(word2)
-        #lemm_sentence1.remove(word1)
-        #lemm_sentence2.remove(word2)
-    #if wordFromList1 and wordFromList2: #Thanks to @alexis' note
-    #    s = wordFromList1[0].wup_similarity(wordFromList2[0])
-    #    list.append(s)
-#for word1 in temp1:
-#    lemm_sentence1.remove(word1)
-
-#for word2 in temp2:
-#    lemm_sentence2.remove(word2)
-
-#print(lemm_sentence1)
-#print(lemm_sentence2)
-
-
-##---------------Final Output---------------##
-
-similarity_index = numpy.mean(final)
-similarity_index = round(similarity_index , 2)
-print("Sentence 1: ",str1)
-print("Sentence 2: ",str2)
-print("Similarity index value : ", similarity_index)
-
-if similarity_index>0.8:
-    print("Similar")
-elif similarity_index>=0.6:
-    print("Somewhat Similar")
-else:
-    print("Not Similar")
\ No newline at end of file