Skip to content

Commit 25de044

Browse files
authored
Merge pull request #18 from infoculture/upgrade
Апгрейд на новый питон и новые зависимости
2 parents 9be1fff + cb4ba33 commit 25de044

File tree

8 files changed

+709
-174
lines changed

8 files changed

+709
-174
lines changed

.gitignore

+5
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
.env
2+
.idea
3+
__pycache__/
4+
var/
5+
.venv

api/readabilityio.py

+14-21
Original file line number | Diff line number | Diff line change
@@ -1,37 +1,33 @@
1-
import tornado.httpserver
2-
import tornado.ioloop
1+
import json
2+
import time
3+
34
import tornado.web
4-
#import memcache
5-
import chardet
6-
import urllib
75
import html2text
86
import requests
9-
import json
10-
from django.utils.feedgenerator import Rss201rev2Feed, Atom1Feed
11-
from pymongo import Connection
7+
8+
from pymongo import MongoClient, ASCENDING
129
from readability.readability import Document
10+
11+
from settings import MONGO_HOST, MONGO_PORT
1312
from textmetric.metric import calc_readability_metrics
14-
import time
13+
1514

1615
READ_DB = 'readability'
1716
LOG_COLL = 'log'
1817

1918
ERROR_NONE = 0
2019
ERROR_INVALID_DATA = 101
2120

21+
2222
class RusMeasureHandler(tornado.web.RequestHandler):
2323
def initialize(self):
24-
self.conn = Connection()
24+
self.conn = MongoClient(MONGO_HOST, MONGO_PORT)
2525
self.db = self.conn[READ_DB]
2626
self.log = self.db[LOG_COLL]
27-
self.log.ensure_index("reqtime", 1)
27+
self.log.create_index([("reqtime", ASCENDING)])
2828

2929
def __log(self, logrec):
30-
self.conn = Connection()
31-
self.db = self.conn[READ_DB]
32-
self.log = self.db[LOG_COLL]
33-
self.log.save(logrec)
34-
30+
self.log.insert_one(logrec)
3531

3632
def get(self):
3733
rtime = time.time()
@@ -41,7 +37,7 @@ def get(self):
4137
debug = int(debug) if debug.isdigit() else 0
4238
r = requests.get(url)
4339
ctype = r.headers['content-type'].lower() if 'content-type' in r.headers.keys() else 'text/html'
44-
print ctype
40+
print(ctype)
4541
ctype = ctype.split(';', 1)[0]
4642
if ctype == 'text/html':
4743
ht = html2text.HTML2Text()
@@ -51,8 +47,7 @@ def get(self):
5147
text = ht.handle(Document(r.text).summary())
5248
status = ERROR_NONE
5349
elif ctype == 'text/plain':
54-
55-
print type(r.content)
50+
print(type(r.content))
5651
text = r.content.decode('utf8', 'ignore')
5752
# text = r.text.decode('utf8', 'ignore')
5853
status = ERROR_NONE
@@ -95,8 +90,6 @@ def post(self):
9590
self.__log(logreq)
9691

9792

98-
99-
10093
application = tornado.web.Application([
10194
(r"/api/1.0/ru/measure/", RusMeasureHandler),
10295
])

api/settings.py

+14
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
from pathlib import Path
2+
3+
import environ
4+
5+
env = environ.Env(
6+
MONGO_HOST=(str, "localhost"),
7+
MONGO_PORT=(int, 27017),
8+
TEXTSBYGRADE_FOLDER=(str, "api/textmetric/textsbygrade"),
9+
)
10+
MONGO_HOST = env("MONGO_HOST")
11+
MONGO_PORT = env("MONGO_PORT")
12+
TEXTSBYGRADE_FOLDER = env("TEXTSBYGRADE_FOLDER")
13+
14+
BASE_DIR = Path(__file__).resolve().parent.parent

api/textmetric/metric.py

+111-92
Large diffs are not rendered by default.

api/tornading.py

+7-10
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,3 @@
1-
#!/usr/bin/env python
2-
# -*- coding: utf-8 -*-
3-
41
import os
52
import sys
63
import signal
@@ -54,7 +51,7 @@ def daemon():
5451
import tornado.ioloop
5552
import tornado.wsgi
5653

57-
#~ # настраиваем Джанго
54+
#~ # настраиваем Торнадо
5855
from readabilityio import application
5956
#
6057
# container = tornado.wsgi.WSGIContainer(application)
@@ -68,18 +65,18 @@ def start():
6865
if not started:
6966
pid = Popen([HOST, os.path.abspath(__file__), 'daemon'],
7067
executable='python').pid
71-
print 'Server started at port %s (pid: %i)...' % (PORT, pid)
68+
print('Server started at port %s (pid: %i)...' % (PORT, pid))
7269
else:
73-
print 'Server alegry started (pid: %i)' % started
70+
print('Server alegry started (pid: %i)' % started)
7471

7572

7673
def stop():
7774
started = alegry_started()
7875
if started:
7976
os.kill(started, signal.SIGKILL)
80-
print 'Server stoped (pid %i)' % started
77+
print('Server stoped (pid %i)' % started)
8178
else:
82-
print 'Server not started'
79+
print('Server not started')
8380

8481

8582
def restart():
@@ -114,5 +111,5 @@ def alegry_started():
114111
cmd = sys.argv[1]
115112
globals()[cmd]()
116113
else:
117-
print 'Error: invalid command'
118-
print 'Usage: python tornader.py {%s}.' % '|'.join(COMMANDS)
114+
print('Error: invalid command')
115+
print('Usage: python tornader.py {%s}.' % '|'.join(COMMANDS))

poetry.lock

+478
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+21
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
[tool.poetry]
2+
name = "plainrussian"
3+
version = "0.1.1"
4+
description = "Applying Anglo-Saxon algos to estimate the readability of Russian texts."
5+
authors = ["Your Name <[email protected]>"]
6+
license = "CC0 1.0 Universal"
7+
readme = "README.md"
8+
9+
[tool.poetry.dependencies]
10+
python = ">=3.9,<4.0"
11+
django-environ = "^0.10.0"
12+
html2text = "^2020.1.16"
13+
pymongo = "^4.4.0"
14+
readability-lxml = "^0.8.1"
15+
requests = "^2.31.0"
16+
tornado = "^6.3.2"
17+
numpy = "^1.25.0"
18+
19+
[build-system]
20+
requires = ["poetry-core"]
21+
build-backend = "poetry.core.masonry.api"

textmetric/metric.py

+59-51
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,16 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8 -*-
2+
33
__author__ = 'ibegtin'
4-
from math import sqrt
54
import csv
6-
5+
from math import sqrt
6+
import os
77

88
from numpy import mean, arange
99

10+
from settings import BASE_DIR, TEXTSBYGRADE_FOLDER
11+
12+
TEXTSBYGRADE_PATH = os.path.join(BASE_DIR, TEXTSBYGRADE_FOLDER)
13+
1014

1115
# Russian sounds and characters
1216
RU_CONSONANTS_LOW = [u'к', u'п', u'с', u'т', u'ф', u'х', u'ц', u'ч', u'ш', u'щ']
@@ -342,55 +346,59 @@ def calc_readability_metrics(text, verbose=True):
342346

343347
def print_metrics(filename, verbose=True):
344348
"""Расчет метрик"""
345-
metrics = calc_text_metrics(filename, verbose)
349+
metrics = calc_text_metrics(filename, verbose)["metrics"]
346350

347-
print u"""
348-
Файл - %s
349-
""" % (filename, )
351+
print(f"(Файл - {filename})")
350352
if verbose:
351-
print u"""
352-
- Символов: %d
353-
- Букв: %d
354-
- Пробелов: %d
355-
- Слов: %d
356-
- Сложных слов: %d
357-
- Слогов: %d
358-
- Предложений: %d
359-
- Доля сложных слов: %f
360-
- Среднее число слогов на слово: %f
361-
- Среднее число слов на предложение: %f
362-
""" %(metrics['chars'], metrics['letters'], metrics['spaces'], metrics['n_words'], metrics['n_complex_words'], metrics['n_syllabes'], metrics['n_sentences'], metrics['c_share'], metrics['avg_syl'], metrics['avg_slen'])#, unfam_words, unf_share)
363-
print '- SMOG: %f' %(calc_SMOG(metrics['n_complex_words'], metrics['n_sentences']))
364-
print '- Gunning fog: %f' %(calc_Gunning_fog(metrics['n_complex_words'], metrics['n_words'], metrics['n_sentences']))
365-
print '- Dale-Chale: %f' %(calc_Dale_Chale_index(metrics['n_complex_words'], metrics['n_words'], metrics['n_sentences']))
366-
print '- Flesh Kincaid: %f' %(calc_Flesh_Kincaid(metrics['n_syllabes'], metrics['n_words'], metrics['n_sentences']))
367-
# print '- Flesh Kincaid (rus): %f' %(calc_Flesh_Kincaid_rus(metrics['n_syllabes'], metrics['n_words'], metrics['n_sentences']))
353+
print(
354+
f"- Символов: {metrics['chars']}\n"
355+
f"- Букв: {metrics['letters']}\n"
356+
f"- Пробелов: {metrics['spaces']}\n"
357+
f"- Слов: {metrics['n_words']}\n"
358+
f"- Сложных слов: {metrics['n_complex_words']}\n"
359+
f"- Слогов: {metrics['n_syllabes']}\n"
360+
f"- Предложений: {metrics['n_sentences']}\n"
361+
f"- Доля сложных слов: {metrics['c_share']}\n"
362+
f"- Слов: {metrics['n_words']}\n"
363+
f"- Среднее число слогов на слово: {metrics['avg_syl']}\n"
364+
f"- Среднее число слов на предложение: {metrics['avg_slen']}\n"
365+
)
366+
print('- SMOG: %f' % (calc_SMOG(metrics['n_complex_words'], metrics['n_sentences'])))
367+
print('- Gunning fog: %f' % (
368+
calc_Gunning_fog(metrics['n_complex_words'], metrics['n_words'], metrics['n_sentences'])))
369+
print(
370+
'- Dale-Chale: %f' % (calc_Dale_Chale(metrics['n_complex_words'], metrics['n_words'], metrics['n_sentences'])))
371+
print(
372+
'- Flesh Kincaid: %f' % (calc_Flesh_Kincaid(metrics['n_syllabes'], metrics['n_words'], metrics['n_sentences'])))
373+
# print('- Flesh Kincaid (rus): %f' %(calc_Flesh_Kincaid_rus(metrics['n_syllabes'], metrics['n_words'], metrics['n_sentences'])))
368374
grade = calc_Flesh_Kincaid_Grade_rus(metrics['n_syllabes'], metrics['n_words'], metrics['n_sentences'])
369375
abs_grade = round(grade)
370-
print '- Flesh Kincaid Grade (rus): %f' %(grade)
376+
print('- Flesh Kincaid Grade (rus): %f' % (grade))
371377
if abs_grade in GRADE_TEXT:
372-
text = GRADE_TEXT[abs_grade]
378+
text = GRADE_TEXT[abs_grade]
373379
elif abs_grade > 17:
374380
text = POST_GRADE_TEXT_18_24
375381
else:
376382
text = u'неизвестно (%d)' % (grade)
377-
print '- Grade level: %s' % text
378-
383+
print('- Grade level: %s' % text)
379384

380385

381386
def generate_all_metrics(outfile="metrics.csv"):
382387
f = open(outfile, 'w')
383-
fieldnames = ['filename', 'name', 'grade', 'index_fk_rus', 'fk_grade_diff', 'index_cl_rus', 'cl_grade_diff', 'index_dc_rus', 'dc_grade_diff', 'index_SMOG_rus', 'SMOG_grade_diff', 'index_ari_rus', 'ari_grade_diff', 'chars', 'spaces', 'letters', 'n_syllabes', 'n_words', 'n_complex_words', 'n_simple_words', 'n_sentences', 'c_share', 'avg_syl', 'avg_slen', 'wsyllabes']
388+
fieldnames = ['filename', 'name', 'grade', 'index_fk_rus', 'fk_grade_diff', 'index_cl_rus', 'cl_grade_diff',
389+
'index_dc_rus', 'dc_grade_diff', 'index_SMOG_rus', 'SMOG_grade_diff', 'index_ari_rus',
390+
'ari_grade_diff', 'chars', 'spaces', 'letters', 'n_syllabes', 'n_words', 'n_complex_words',
391+
'n_simple_words', 'n_sentences', 'c_share', 'avg_syl', 'avg_slen', 'wsyllabes']
384392
writer = csv.DictWriter(f, fieldnames)
385393
writer.writeheader()
386394
diffs = []
387-
avg_diff = 0
388395
for text in TEXT_LIST:
389-
metrics = calc_text_metrics('textsbygrade/%d/%s' %(text[1], text[0]))
390-
print text[0]
396+
metrics = calc_text_metrics(os.path.join(TEXTSBYGRADE_PATH, f"{text[1]}", f"{text[0]}"))
397+
print(text[0])
391398
for k, v in metrics['wsyllabes'].items():
392-
print "- %s: %d of %d (%f)" %(k, v, metrics['n_words'], float(v) * 100.0 / metrics['n_words'])
393-
print '- simple words: %d (%f%%)' % (metrics['n_simple_words'], float(metrics['n_simple_words']) * 100.0 / metrics['n_words'])
399+
print("- %s: %d of %d (%f)" % (k, v, metrics['n_words'], float(v) * 100.0 / metrics['n_words']))
400+
print('- simple words: %d (%f%%)' % (
401+
metrics['n_simple_words'], float(metrics['n_simple_words']) * 100.0 / metrics['n_words']))
394402

395403
metrics['name'] = text[3]
396404
metrics['filename'] = text[0]
@@ -425,30 +433,29 @@ def generate_all_metrics(outfile="metrics.csv"):
425433
grade_diff = metrics['grade'] - metrics['index_ari_rus']
426434
metrics['ari_grade_diff'] = grade_diff
427435

428-
429436
diffs.append(grade_diff)
430437
for k in metrics.keys():
431438
metrics[k] = metrics[k].encode('utf8') if type(metrics[k]) == type(u'') else str(metrics[k])
432439
writer.writerow(metrics)
433440
avg_diff = mean(diffs)
434441
diffs.sort()
435-
print diffs
436-
print avg_diff
442+
print(diffs)
443+
print(avg_diff)
437444
f.close()
438445

446+
439447
def print_all_metrics():
440448
for text in TEXT_LIST:
441-
print "#", text[3].encode('utf8')
442-
print_metrics('textsbygrade/%d/%s' %(text[1], text[0]))
443-
print "----"
444-
449+
print("#", text[3].encode('utf8'))
450+
print_metrics(os.path.join(TEXTSBYGRADE_PATH, f"{text[1]}", f"{text[0]}"))
451+
print("----")
445452

446453

447454
def load_metrics():
448455
allmetrics = []
449456
for text in TEXT_LIST:
450457
# if text[1] > 16: continue
451-
metrics = calc_text_metrics('textsbygrade/%d/%s' %(text[1], text[0]))
458+
metrics = calc_text_metrics(os.path.join(TEXTSBYGRADE_PATH, f"{text[1]}", f"{text[0]}"))
452459
metrics['name'] = text[3]
453460
metrics['filename'] = text[0]
454461
metrics['grade'] = text[1]
@@ -499,13 +506,13 @@ def adapt_algorithm_2r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
499506
total = 1
500507
for r in ranges:
501508
total *= (r[1] - r[0]) / r[2]
502-
print r
509+
print(r)
503510

504511
for r1 in arange(*ranges[0]):
505512
for r2 in arange(*ranges[1]):
506513
n += 1
507514
if n % 1000 == 0:
508-
print 'Processing %d of %d' % (n, total), 'values', r1, r2
515+
print('Processing %d of %d' % (n, total), 'values', r1, r2)
509516
diffs = calc_diff(allmetrics, func, keys, [r1, r2])
510517
# print diffs
511518

@@ -516,13 +523,13 @@ def adapt_algorithm_2r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
516523
if avg_hybrid < best_diff[0]:
517524
best_diff = [avg_hybrid, avg_mean, avg_max]
518525
best_mark = [r1, r2]
519-
print 'Best - x: %f, y: %f with hybrid %f, mean %f and max %f' %(r1, r2, avg_hybrid, avg_mean, avg_max)
526+
print('Best - x: %f, y: %f with hybrid %f, mean %f and max %f' %(r1, r2, avg_hybrid, avg_mean, avg_max))
520527
best_alldiffs = diffs
521528
else:
522529
best_diff = [avg_hybrid, avg_mean, avg_max]
523530
best_mark = [r1, r2]
524531
best_alldiffs = diffs
525-
print 'Best - x: %f, y: %f with value hybrid %f, max %f, mean %f ' %(best_mark[0], best_mark[1], best_diff[0], best_diff[1], best_diff[2])
532+
print('Best - x: %f, y: %f with value hybrid %f, max %f, mean %f ' %(best_mark[0], best_mark[1], best_diff[0], best_diff[1], best_diff[2]))
526533

527534
def adapt_algorithm_3r(func, keys=[], ranges=[], expected_max=3.0, expected_mean=1.1):
528535
best_diff = [-1, -1, -1]
@@ -535,14 +542,14 @@ def adapt_algorithm_3r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
535542
total = 1
536543
for r in ranges:
537544
total *= (r[1] - r[0]) / r[2]
538-
print r
545+
print(r)
539546

540547
for r1 in arange(*ranges[0]):
541548
for r2 in arange(*ranges[1]):
542549
for r3 in arange(*ranges[2]):
543550
n += 1
544551
if n % 1000 == 0:
545-
print 'Processing %d of %d' % (n, total), 'values', r1, r2, r3
552+
print('Processing %d of %d' % (n, total), 'values', r1, r2, r3)
546553
diffs = calc_diff(allmetrics, func, keys, [r1, r2, r3])
547554
if len(diffs) == 0: continue
548555
avg_mean = mean(diffs)
@@ -552,14 +559,15 @@ def adapt_algorithm_3r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
552559
if avg_hybrid < best_diff[0]:
553560
best_diff = [avg_hybrid, avg_mean, avg_max]
554561
best_mark = [r1, r2, r3]
555-
print 'Best - x: %f, y: %f, z: %f with hybrid %f, mean %f and max %f' %(r1, r2, r3, avg_hybrid, avg_mean, avg_max)
556-
# print 'Diffs', diffs
562+
print('Best - x: %f, y: %f, z: %f with hybrid %f, mean %f and max %f' % (
563+
r1, r2, r3, avg_hybrid, avg_mean, avg_max))
564+
# print 'Diffs', diffs
557565
best_alldiffs = diffs
558566
else:
559567
best_diff = [avg_hybrid, avg_mean, avg_max]
560568
best_mark = [r1, r2, r3]
561569
best_alldiffs = diffs
562-
print 'Best - x: %f, y: %f, z: %f with value hybrid %f, mean %f, max %f ' %(best_mark[0], best_mark[1], best_mark[2], best_diff[0], best_diff[1], best_diff[2])
570+
print('Best - x: %f, y: %f, z: %f with value hybrid %f, mean %f, max %f ' %(best_mark[0], best_mark[1], best_mark[2], best_diff[0], best_diff[1], best_diff[2]))
563571

564572

565573
if __name__ == "__main__":

0 commit comments

Comments
 (0)