1
1
#!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
2
+
3
3
__author__ = 'ibegtin'
4
- from math import sqrt
5
4
import csv
6
-
5
+ from math import sqrt
6
+ import os
7
7
8
8
from numpy import mean , arange
9
9
10
+ from settings import BASE_DIR , TEXTSBYGRADE_FOLDER
11
+
12
+ TEXTSBYGRADE_PATH = os .path .join (BASE_DIR , TEXTSBYGRADE_FOLDER )
13
+
10
14
11
15
# Russian sounds and characters
12
16
RU_CONSONANTS_LOW = [u'к' , u'п' , u'с' , u'т' , u'ф' , u'х' , u'ц' , u'ч' , u'ш' , u'щ' ]
@@ -342,55 +346,59 @@ def calc_readability_metrics(text, verbose=True):
342
346
343
347
def print_metrics(filename, verbose=True):
    """Print text statistics and readability indices for one file.

    The statistics are taken from the ``"metrics"`` entry of the value
    returned by ``calc_text_metrics(filename, verbose)``.

    Args:
        filename: Path of the text file; passed to ``calc_text_metrics``
            and echoed in the report header.
        verbose: When true, the raw counters (characters, letters, words,
            syllables, sentences, ...) are printed before the indices.
            The flag is also forwarded to ``calc_text_metrics``.

    Returns:
        None. Everything is written to standard output.
    """
    metrics = calc_text_metrics(filename, verbose)["metrics"]

    print(f"(Файл - {filename})")
    if verbose:
        # Each counter is listed exactly once; the word count used to be
        # repeated a second time after the complex-word share.
        print(
            f"- Символов: {metrics['chars']}\n"
            f"- Букв: {metrics['letters']}\n"
            f"- Пробелов: {metrics['spaces']}\n"
            f"- Слов: {metrics['n_words']}\n"
            f"- Сложных слов: {metrics['n_complex_words']}\n"
            f"- Слогов: {metrics['n_syllabes']}\n"
            f"- Предложений: {metrics['n_sentences']}\n"
            f"- Доля сложных слов: {metrics['c_share']}\n"
            f"- Среднее число слогов на слово: {metrics['avg_syl']}\n"
            f"- Среднее число слов на предложение: {metrics['avg_slen']}\n"
        )

    # NOTE(review): the index lines below are assumed to be printed
    # regardless of `verbose` -- confirm against the intended indentation.
    n_complex_words = metrics['n_complex_words']
    n_syllabes = metrics['n_syllabes']
    n_words = metrics['n_words']
    n_sentences = metrics['n_sentences']

    print('- SMOG: %f' % (calc_SMOG(n_complex_words, n_sentences)))
    print('- Gunning fog: %f' % (
        calc_Gunning_fog(n_complex_words, n_words, n_sentences)))
    print('- Dale-Chale: %f' % (
        calc_Dale_Chale(n_complex_words, n_words, n_sentences)))
    print('- Flesh Kincaid: %f' % (
        calc_Flesh_Kincaid(n_syllabes, n_words, n_sentences)))
    # print('- Flesh Kincaid (rus): %f' % (calc_Flesh_Kincaid_rus(n_syllabes, n_words, n_sentences)))

    grade = calc_Flesh_Kincaid_Grade_rus(n_syllabes, n_words, n_sentences)
    # The rounded grade selects the textual description of the level.
    abs_grade = round(grade)
    print('- Flesh Kincaid Grade (rus): %f' % (grade))
    if abs_grade in GRADE_TEXT:
        text = GRADE_TEXT[abs_grade]
    elif abs_grade > 17:
        text = POST_GRADE_TEXT_18_24
    else:
        # No description is known for this grade; report the raw value.
        text = u'неизвестно (%d)' % (grade)
    print('- Grade level: %s' % text)
379
384
380
385
381
386
def generate_all_metrics (outfile = "metrics.csv" ):
382
387
f = open (outfile , 'w' )
383
- fieldnames = ['filename' , 'name' , 'grade' , 'index_fk_rus' , 'fk_grade_diff' , 'index_cl_rus' , 'cl_grade_diff' , 'index_dc_rus' , 'dc_grade_diff' , 'index_SMOG_rus' , 'SMOG_grade_diff' , 'index_ari_rus' , 'ari_grade_diff' , 'chars' , 'spaces' , 'letters' , 'n_syllabes' , 'n_words' , 'n_complex_words' , 'n_simple_words' , 'n_sentences' , 'c_share' , 'avg_syl' , 'avg_slen' , 'wsyllabes' ]
388
+ fieldnames = ['filename' , 'name' , 'grade' , 'index_fk_rus' , 'fk_grade_diff' , 'index_cl_rus' , 'cl_grade_diff' ,
389
+ 'index_dc_rus' , 'dc_grade_diff' , 'index_SMOG_rus' , 'SMOG_grade_diff' , 'index_ari_rus' ,
390
+ 'ari_grade_diff' , 'chars' , 'spaces' , 'letters' , 'n_syllabes' , 'n_words' , 'n_complex_words' ,
391
+ 'n_simple_words' , 'n_sentences' , 'c_share' , 'avg_syl' , 'avg_slen' , 'wsyllabes' ]
384
392
writer = csv .DictWriter (f , fieldnames )
385
393
writer .writeheader ()
386
394
diffs = []
387
- avg_diff = 0
388
395
for text in TEXT_LIST :
389
- metrics = calc_text_metrics ('textsbygrade/%d/%s' % ( text [1 ], text [0 ]))
390
- print text [0 ]
396
+ metrics = calc_text_metrics (os . path . join ( TEXTSBYGRADE_PATH , f" { text [1 ]} " , f" { text [0 ]} " ))
397
+ print ( text [0 ])
391
398
for k , v in metrics ['wsyllabes' ].items ():
392
- print "- %s: %d of %d (%f)" % (k , v , metrics ['n_words' ], float (v ) * 100.0 / metrics ['n_words' ])
393
- print '- simple words: %d (%f%%)' % (metrics ['n_simple_words' ], float (metrics ['n_simple_words' ]) * 100.0 / metrics ['n_words' ])
399
+ print ("- %s: %d of %d (%f)" % (k , v , metrics ['n_words' ], float (v ) * 100.0 / metrics ['n_words' ]))
400
+ print ('- simple words: %d (%f%%)' % (
401
+ metrics ['n_simple_words' ], float (metrics ['n_simple_words' ]) * 100.0 / metrics ['n_words' ]))
394
402
395
403
metrics ['name' ] = text [3 ]
396
404
metrics ['filename' ] = text [0 ]
@@ -425,30 +433,29 @@ def generate_all_metrics(outfile="metrics.csv"):
425
433
grade_diff = metrics ['grade' ] - metrics ['index_ari_rus' ]
426
434
metrics ['ari_grade_diff' ] = grade_diff
427
435
428
-
429
436
diffs .append (grade_diff )
430
437
for k in metrics .keys ():
431
438
metrics [k ] = metrics [k ].encode ('utf8' ) if type (metrics [k ]) == type (u'' ) else str (metrics [k ])
432
439
writer .writerow (metrics )
433
440
avg_diff = mean (diffs )
434
441
diffs .sort ()
435
- print diffs
436
- print avg_diff
442
+ print ( diffs )
443
+ print ( avg_diff )
437
444
f .close ()
438
445
446
+
439
447
def print_all_metrics():
    """Print the metrics report for every entry of ``TEXT_LIST``.

    For each entry a ``# <title>`` header is printed, then the report
    produced by ``print_metrics`` for the file located at
    ``TEXTSBYGRADE_PATH/<text[1]>/<text[0]>``, then a ``----`` separator.

    Returns:
        None. Everything is written to standard output.
    """
    for text in TEXT_LIST:
        # Print the title as text: encoding it to UTF-8 first would make
        # Python 3 print the bytes repr (b'\xd0...') instead of the title.
        print("#", text[3])
        # text[1] is the sub-folder name, text[0] the file name inside it.
        path = os.path.join(TEXTSBYGRADE_PATH, f"{text[1]}", f"{text[0]}")
        print_metrics(path)
        print("----")
445
452
446
453
447
454
def load_metrics ():
448
455
allmetrics = []
449
456
for text in TEXT_LIST :
450
457
# if text[1] > 16: continue
451
- metrics = calc_text_metrics ('textsbygrade/%d/%s' % ( text [1 ], text [0 ]))
458
+ metrics = calc_text_metrics (os . path . join ( TEXTSBYGRADE_PATH , f" { text [1 ]} " , f" { text [0 ]} " ))
452
459
metrics ['name' ] = text [3 ]
453
460
metrics ['filename' ] = text [0 ]
454
461
metrics ['grade' ] = text [1 ]
@@ -499,13 +506,13 @@ def adapt_algorithm_2r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
499
506
total = 1
500
507
for r in ranges :
501
508
total *= (r [1 ] - r [0 ]) / r [2 ]
502
- print r
509
+ print ( r )
503
510
504
511
for r1 in arange (* ranges [0 ]):
505
512
for r2 in arange (* ranges [1 ]):
506
513
n += 1
507
514
if n % 1000 == 0 :
508
- print 'Processing %d of %d' % (n , total ), 'values' , r1 , r2
515
+ print ( 'Processing %d of %d' % (n , total ), 'values' , r1 , r2 )
509
516
diffs = calc_diff (allmetrics , func , keys , [r1 , r2 ])
510
517
# print diffs
511
518
@@ -516,13 +523,13 @@ def adapt_algorithm_2r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
516
523
if avg_hybrid < best_diff [0 ]:
517
524
best_diff = [avg_hybrid , avg_mean , avg_max ]
518
525
best_mark = [r1 , r2 ]
519
- print 'Best - x: %f, y: %f with hybrid %f, mean %f and max %f' % (r1 , r2 , avg_hybrid , avg_mean , avg_max )
526
+ print ( 'Best - x: %f, y: %f with hybrid %f, mean %f and max %f' % (r1 , r2 , avg_hybrid , avg_mean , avg_max ) )
520
527
best_alldiffs = diffs
521
528
else :
522
529
best_diff = [avg_hybrid , avg_mean , avg_max ]
523
530
best_mark = [r1 , r2 ]
524
531
best_alldiffs = diffs
525
- print 'Best - x: %f, y: %f with value hybrid %f, max %f, mean %f ' % (best_mark [0 ], best_mark [1 ], best_diff [0 ], best_diff [1 ], best_diff [2 ])
532
+ print ( 'Best - x: %f, y: %f with value hybrid %f, max %f, mean %f ' % (best_mark [0 ], best_mark [1 ], best_diff [0 ], best_diff [1 ], best_diff [2 ]) )
526
533
527
534
def adapt_algorithm_3r (func , keys = [], ranges = [], expected_max = 3.0 , expected_mean = 1.1 ):
528
535
best_diff = [- 1 , - 1 , - 1 ]
@@ -535,14 +542,14 @@ def adapt_algorithm_3r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
535
542
total = 1
536
543
for r in ranges :
537
544
total *= (r [1 ] - r [0 ]) / r [2 ]
538
- print r
545
+ print ( r )
539
546
540
547
for r1 in arange (* ranges [0 ]):
541
548
for r2 in arange (* ranges [1 ]):
542
549
for r3 in arange (* ranges [2 ]):
543
550
n += 1
544
551
if n % 1000 == 0 :
545
- print 'Processing %d of %d' % (n , total ), 'values' , r1 , r2 , r3
552
+ print ( 'Processing %d of %d' % (n , total ), 'values' , r1 , r2 , r3 )
546
553
diffs = calc_diff (allmetrics , func , keys , [r1 , r2 , r3 ])
547
554
if len (diffs ) == 0 : continue
548
555
avg_mean = mean (diffs )
@@ -552,14 +559,15 @@ def adapt_algorithm_3r(func, keys=[], ranges=[], expected_max=3.0, expected_mean
552
559
if avg_hybrid < best_diff [0 ]:
553
560
best_diff = [avg_hybrid , avg_mean , avg_max ]
554
561
best_mark = [r1 , r2 , r3 ]
555
- print 'Best - x: %f, y: %f, z: %f with hybrid %f, mean %f and max %f' % (r1 , r2 , r3 , avg_hybrid , avg_mean , avg_max )
556
- # print 'Diffs', diffs
562
+ print ('Best - x: %f, y: %f, z: %f with hybrid %f, mean %f and max %f' % (
563
+ r1 , r2 , r3 , avg_hybrid , avg_mean , avg_max ))
564
+ # print 'Diffs', diffs
557
565
best_alldiffs = diffs
558
566
else :
559
567
best_diff = [avg_hybrid , avg_mean , avg_max ]
560
568
best_mark = [r1 , r2 , r3 ]
561
569
best_alldiffs = diffs
562
- print 'Best - x: %f, y: %f, z: %f with value hybrid %f, mean %f, max %f ' % (best_mark [0 ], best_mark [1 ], best_mark [2 ], best_diff [0 ], best_diff [1 ], best_diff [2 ])
570
+ print ( 'Best - x: %f, y: %f, z: %f with value hybrid %f, mean %f, max %f ' % (best_mark [0 ], best_mark [1 ], best_mark [2 ], best_diff [0 ], best_diff [1 ], best_diff [2 ]) )
563
571
564
572
565
573
if __name__ == "__main__" :
0 commit comments