# total_sentences_number.py
import numpy as np
import matplotlib.pyplot as plt
file_len=[284810, 198987, 206480, 233680, 203700, 180469, 188280, 165878, 142078, 138578, 116181, 125079, 111298, 121766, 104576, 91638, 92373, 100034, 99248, 82121, 63904, 92774, 87127, 76909, 90947, 82747, 88419, 79408, 91015, 83394, 80722, 75522, 75121, 74411, 86421, 77339, 74187, 74321, 69763, 74700, 65415, 72972, 81427, 72469, 57033, 60014, 68724, 67919, 64317, 65718, 63709, 73884, 63135, 66792, 61698, 52837, 61865, 63933, 46674, 50644, 55269, 49427, 45137, 51423, 50834, 45532, 31895, 54779, 45527, 44512, 47454, 52572, 44584, 36086, 33058, 42384, 38166, 35590, 42483, 47747, 42555, 42156, 44521, 45849, 32758, 37156, 33168, 50780, 38138, 45593, 36492, 20997, 33276, 37743, 37105, 37188, 36661, 32130, 33540, 33637, 32645, 38051, 30502, 25282, 28455, 32708, 34996, 30482, 27842, 29247, 27902, 28811, 25777, 28290, 17760, 24191, 30453, 14734, 23269, 24401, 29004, 30891, 24759, 27397, 25274, 22979, 20312, 28058, 24635, 22753, 25717, 23488, 26475, 24989, 20009, 20073, 18912, 25129, 23304, 19992, 21923, 20981, 20440, 18640, 15302, 24039, 23236, 12967, 19470, 20473, 16439, 22586, 23426, 23458, 20012, 18901, 20905, 18515, 22160, 19986, 23946, 21037, 18675, 17378, 25130, 21162, 23289, 22016, 21233, 20143, 17363, 17801, 15792, 15321, 21229, 22507, 10999, 20198, 17164, 17367, 19266, 20309, 19377, 18691, 11574, 12868, 17877, 18733, 20859, 19377, 19825, 18997, 6839, 16483, 17012, 18111, 17765, 18219, 14810, 17468, 14664, 15906, 17724, 17038, 13414, 15483, 15442, 16707, 16038, 18177, 15463, 12839, 11963, 17012, 15107, 17400, 15999, 14774, 11591, 16155, 12166, 15409, 15625, 16768, 13790, 14314, 13847, 11292, 13985, 17091, 13683, 14599, 16140, 12803, 15047, 12868, 10837, 15120, 11863, 16060, 11120, 13237, 14795, 14824, 11588, 15002, 11726, 8269, 12660, 12399, 11976, 11677, 10700, 9900, 11342, 10880, 13354, 12791, 11584, 13128, 8286, 12475, 11212, 11368, 8733, 9151, 10633, 9121, 13108, 10723, 12064, 11709, 12157, 11599, 12841, 9352, 8998, 10378, 9762, 11738, 9838, 12042, 9745, 
11901, 10828, 8652, 6781, 11524, 9061, 9005, 9611, 11132, 6461, 11544, 8719, 10592, 11860, 9906, 9824, 11049, 9238, 10701, 9455, 9726, 11643, 10868, 10299, 9383, 9142, 9104, 10696, 10823, 7744, 8411, 8596, 9254, 9891, 9180, 8721, 10215, 10223, 8548, 9542, 9122, 9254, 9145, 11279, 8754, 6928, 10566, 6213, 10005, 9573, 8177, 8916, 9236, 7627, 8967, 7354, 9357, 9758, 9602, 8622, 8133, 9067, 8985, 8528, 7366, 8722, 8712, 7375, 8011, 8345, 7668, 7466, 7258, 6337, 7603, 7103, 9276, 6897, 8760, 6054, 6835, 7100, 8568, 7429, 7665, 8275, 8336, 9276, 7979, 6826, 8315, 6001, 7708, 7677, 8546, 6615, 6675, 7225, 8220, 6236, 7980, 8156, 6629, 5817, 6391, 6046, 8045, 7526, 6438, 7205, 5947, 6394, 6013, 5786, 6305, 7870, 6620, 6677, 6476, 5846, 6050, 6164, 5670, 5777, 6609, 7019, 7562, 6908, 6969, 6468, 6740, 5558, 5629, 6561, 7335, 6507, 7590, 7595, 5609, 6361, 5574, 6538, 6889, 6595, 5065, 6173, 4772, 6249, 6412, 6907, 6712, 5976, 4937, 6393, 4956, 5112, 6512, 5288, 5023, 6195, 5136, 6766, 6859, 6145, 6370, 5782, 5105, 5460, 5857, 6574, 6297, 4509, 4706, 4519, 5752, 5648, 4642, 6833, 5570, 3400, 3924, 5573, 4747, 5225, 5065, 5653, 5273, 5423, 5137, 5268, 5584, 5321, 5168, 5370, 5604, 5374, 3753, 4994, 5224, 3780, 5373, 5893, 4184, 6180, 5295, 5082, 4590, 5420, 4971, 4847, 4611, 5092, 4141, 4407, 4539, 5046, 5053, 5079, 3671, 4284, 3714, 4399, 4103, 5240, 4509, 4540, 4118, 4714, 5304, 4365, 4374, 4725, 3377, 3809, 4947, 4586, 4453, 4005, 4058, 4592, 5159, 4794, 2936, 4134, 4234, 3665, 4681, 4410, 3890, 4494, 4423, 4818, 4666, 3952, 4656, 3913, 3490, 3874, 3960, 3725, 3412, 3054, 4775, 3730, 4227, 3860, 4520, 3996, 3367, 4635, 3772, 4744, 3626, 4222, 3960, 4707, 3178, 4101, 4220, 3207, 3708, 3665, 3487, 3845, 2764, 4558, 3490, 3899, 4072, 3020, 3556, 3925, 3414, 4547, 4106, 2695, 3767, 2874, 3249, 3764, 3351, 2983, 2636, 3985, 3803, 3452, 3816, 3583, 3602, 4067, 2906, 3256, 3572, 2490, 3631, 2622, 2328, 3320, 2808, 3511, 2087, 1857, 3774, 3171, 3764, 3228, 3525, 1938, 3934, 3720, 
3396, 3495, 2661, 2947, 1974, 3917, 3653, 3527, 3106, 2872, 3746, 3680, 3219, 2951, 2969, 2644, 3880, 2841, 2840, 3360, 3758, 2841, 3767, 3098, 3098, 2631, 2627, 2987, 3106, 2258, 2201, 2770, 1968, 2942, 3022, 2533, 2608, 2845, 2957, 3169, 2307, 2438, 2568, 2218, 2909, 2014, 2953, 2759, 2210, 3112, 2328, 2198, 3279, 2286, 2669, 2341, 2821, 2326, 3134, 2946, 2443, 2688, 2636, 1759, 1625, 1807, 2460, 2234, 2876, 3060, 2603, 3002, 2002, 2234, 2585, 2430, 2737, 2785, 2332, 2857, 1716, 2528, 3110, 1635, 2379, 2590, 2558, 2530, 2122, 1978, 2277, 2101, 1719, 2067, 2346, 2563, 2463, 1920, 2716, 1853, 2418, 1976, 2397, 2584, 2240, 1932, 2713, 2112, 1953, 2354, 2607, 2042, 2133, 2178, 1989, 2425, 2293, 2244, 2307, 2014, 1811, 2268, 2241, 1684, 1429, 2215, 2333, 1902, 2121, 2591, 2083, 1803, 2063, 1873, 2026, 2092, 2190, 1497, 1307, 2001, 2476, 1531, 1920, 2019, 2106, 2294, 1486, 1976, 2112, 1851, 2074, 2085, 1364, 1630, 1667, 1718, 1779, 2143, 1886, 1365, 2449, 1832, 2053, 1958, 2067, 1855, 1748, 1957, 1690, 2077, 1495, 1852, 1521, 1900, 1528, 1821, 2110, 1801, 1282, 1723, 2034, 1460, 1535, 1606, 1461, 1115, 1922, 1978, 1679, 1720, 2171, 1801, 1639, 1715, 1651, 1577, 1704, 1658, 1863, 1841, 1912, 2034, 760, 1142, 1496, 1732, 1634, 1795, 1334, 1521, 1547, 971, 1951, 1277, 1061, 1732, 1692, 1499, 1080, 1304, 1756, 1753, 1055, 1592, 1357, 1674, 1766, 1529, 1282, 1619, 1665, 1372, 1580, 1252, 1609, 1576, 1440, 1504, 1285, 1269, 885, 950, 1238, 1442, 902, 1516, 1506, 1406, 1554, 1558, 1542, 1281, 1636, 1476, 1119, 1451, 1276, 1305, 1532, 1117, 1134, 1122, 1131, 1428, 1168, 1310, 998, 1226, 1153, 1100, 1257, 1330, 1380, 1094, 1138, 1159, 1228, 1402, 1661, 1318, 1459, 1281, 1184, 1194, 913, 1421, 1542, 916, 1123, 921, 1458, 875, 1393, 1291, 976, 1486, 883, 1352, 1432, 1308, 1316, 1039, 662, 1046, 1264, 1098, 1134, 1147, 1010, 949, 1110, 995, 888, 576, 1025, 1126, 1152, 987, 813, 895, 872, 890, 780, 1165, 1195, 1111, 1144, 1122, 1115, 1160, 992, 1086, 1009, 980, 866, 951, 562, 962, 
1086, 1087, 1016, 985, 993, 1154, 1103, 815, 774, 696, 1055, 1017, 984, 969, 968, 838, 827, 980, 1052, 1094, 680, 1330, 933, 721, 673, 731, 1182, 917, 1067, 942, 629, 688, 1094, 921, 723, 860, 1108, 748, 938, 953, 912, 885, 916, 805, 749, 985, 1004, 628, 696, 728, 715, 1022, 884, 965, 792, 857, 760, 968, 767, 650, 896, 857, 707, 804, 705, 944, 559, 742, 529, 843, 779, 683, 795, 703, 622, 827, 560, 864, 781, 671, 864, 883, 798, 577, 935, 653, 729, 678, 834, 782, 440, 897, 665, 724, 693, 681, 847, 512, 682, 576, 744, 551, 672, 815, 674, 722, 753, 690, 467, 689, 701, 646, 776, 581, 659, 696, 653, 515, 656, 751, 604, 716, 745, 709, 572, 553, 803, 597, 713, 711, 814, 649, 494, 510, 736, 665, 748, 698, 447, 496, 482, 508, 560, 644, 573, 549, 709, 601, 432, 565, 651, 556, 346, 310, 435, 504, 439, 469, 542, 579, 344, 287, 444, 434, 522, 655, 378, 485, 535, 543, 619, 474, 361, 559, 497, 427, 332, 412, 457, 561, 574, 361, 564, 306, 322, 410, 516, 425, 350, 542, 581, 493, 299, 457, 465, 456, 413, 428, 325, 470, 498, 462, 337, 445, 428, 509, 405, 379, 425, 458, 413, 514, 411, 494, 298, 499, 453, 472, 331, 176, 349, 391, 389, 457, 323, 494, 340, 393, 345, 396, 328, 318, 402, 259, 393, 366, 438, 411, 383, 238, 318, 273, 339, 351, 288, 323, 396, 219, 263, 367, 392, 341, 359, 270, 352, 358, 291, 330, 334, 175, 319, 282, 290, 291, 246, 363, 331, 276, 376, 377, 261, 313, 285, 281, 336, 338, 300, 289, 349, 173, 276, 275, 311, 330, 392, 315, 307, 320, 184, 358, 159, 325, 259, 299, 320, 334, 210, 260, 315, 210, 258, 177, 319, 275, 286, 251, 202, 257, 228, 273, 282, 252, 220, 218, 298, 265, 281, 272, 242, 241, 179, 268, 202, 282, 297, 203, 217, 272, 242, 231, 253, 281, 246, 270, 264, 261, 283, 235, 282, 242, 210, 232, 261, 193, 240, 256, 143, 219, 241, 199, 274, 178, 222, 279, 255, 183, 208, 224, 203, 240, 236, 234, 214, 255, 206, 154, 218, 129, 176, 145, 239, 205, 189, 195, 137, 203, 175, 127, 129, 113, 186, 100, 155, 213, 194, 171, 188, 188, 191, 215, 161, 185, 166, 175, 127, 171, 
159, 170, 186, 139, 153, 166, 188, 126, 135, 176, 150, 167, 178, 177, 128, 120, 160, 121, 152, 178, 172, 179, 97, 87, 184, 107, 125, 154, 130, 78, 166, 155, 171, 152, 163, 130, 117, 119, 157, 134, 138, 133, 135, 127, 129, 74, 165, 165, 135, 81, 158, 105, 117, 147, 137, 163, 110, 110, 131, 101, 98, 105, 113, 100, 132, 125, 124, 116, 131, 107, 128, 91, 88, 119, 109, 117, 133, 89, 103, 115, 99, 104, 90, 76, 120, 117, 66, 115, 111, 134, 97, 112, 104, 95, 100, 90, 93, 120, 96, 85, 88, 99, 99, 106, 79, 119, 99, 111, 109, 82, 107, 91, 110, 74, 62, 81, 84, 86, 81, 100, 102, 99, 99, 65, 98, 85, 74, 77, 89, 73, 92, 87, 71, 49, 84, 94, 63, 88, 64, 65, 83, 66, 57, 72, 55, 70, 62, 82, 69, 56, 70, 46, 27, 78, 59, 52, 44, 41, 63, 55, 60, 48, 63, 42, 58, 63, 50, 42, 53, 61, 69, 47, 40, 31, 50, 45, 52, 44, 27, 35, 51, 44, 36, 40, 39, 24, 41, 43, 25, 32, 42, 43, 23, 19, 34, 17, 32, 34, 33, 13, 33, 27, 25, 32, 22, 27, 22, 24, 13, 19, 17, 19, 16, 18, 21, 21, 15, 18, 12, 13, 14, 21, 19, 18, 12, 14, 12, 12, 17, 17, 9, 13, 10, 11, 17, 10, 10, 9, 12, 13, 6, 12, 11, 13, 5, 6, 9, 11, 11, 9, 8, 11, 7, 6, 7, 6, 7, 7, 7, 7, 6, 3, 4, 3, 5, 4, 5, 2, 3, 2, 2, 2, 2, 2, 1, 1, 1]
# Sum every entry while file_len is still a plain Python list, so the total
# is an ordinary int rather than a NumPy scalar.
total_sentences = sum(file_len)
print(total_sentences)

# Rebind file_len to an ndarray so the threshold comparison is element-wise.
file_len = np.asarray(file_len)

# Indices of the entries strictly greater than 10000, followed by how many
# such entries there are.
pos = np.flatnonzero(file_len > 10000)
print(pos, pos.size)

# The bare string literal below is disabled plotting code: it is evaluated as
# a no-op expression and never executed. If re-enabled, it would plot each
# file_len value as a red circle marker.
# NOTE(review): np.linspace(0, n, n) spaces x by n/(n-1) rather than 1;
# np.arange(n) would give exact integer indices -- confirm before re-enabling.
"""
x = np.linspace(0, len(file_len), len(file_len))
y = file_len
fig, ax = plt.subplots()
ax.plot(x, y, 'ro')
plt.show()
"""