diff --git a/pylintrc b/pylintrc index deeb5a8fb..439653de1 100644 --- a/pylintrc +++ b/pylintrc @@ -24,23 +24,23 @@ extension-pkg-whitelist= [MESSAGES CONTROL] disable= - # C103, C114, C115, C116 + # C0103, C0114, C0115, C0116 invalid-name, missing-module-docstring, missing-class-docstring, missing-function-docstring, - # C301, C302 + # C0301, C0302 line-too-long, too-many-lines, - # C413, C415 + # C0413, C0415 wrong-import-position, import-outside-toplevel, - # R401 + # R0401 cyclic-import, - # R801 + # R0801 duplicate-code, - # R901, R902, R903, R904, R912, R913, R914, R915, R916 + # R0901, R0902, R0903, R0904, R0912, R0913, R0914, R0915, R0916, R0917 too-many-ancestors, too-many-instance-attributes, too-few-public-methods, @@ -50,21 +50,22 @@ disable= too-many-locals, too-many-statements, too-many-branches, - # R1702, R1705, R1723, R1724 + too-many-positional, + # R1702, R1705, R1720, R1723, R1724 too-many-nested-blocks, no-else-return, + no-else-raise, no-else-break, no-else-continue, - # W201, W212 + # W0201, W0212 attribute-defined-outside-init, protected-access, - # W511 - fixme, - # W621 + # W0603, W0621 + global-statement, redefined-outer-name, - # E401 + # E0401 import-error, - # E1121 - too-many-function-args + # E0606 + possibly-used-before-assignment, diff --git a/tests/files/file_area/misc/[eng_gb] Tagged.txt b/tests/files/file_area/misc/[eng_gb] Tagged.txt new file mode 100644 index 000000000..e481a619f --- /dev/null +++ b/tests/files/file_area/misc/[eng_gb] Tagged.txt @@ -0,0 +1,87 @@ +CHAPTER_NN I_CD +There_EX was_VBD no_DT possibility_NN of_IN taking_VBG a_DT walk_NN that_DT day_NN ._. We_PRP had_VBD been_VBN wandering_VBG ,_, indeed_RB ,_, in_IN the_DT leafless_JJ shrubbery_NN an_DT hour_NN in_IN the_DT morning_NN ;_: but_CC since_IN dinner_NN (_-LRB- Mrs._NNP Reed_NNP ,_, when_WRB there_EX was_VBD no_DT company_NN ,_, dined_VBD early_RB )_-RRB- the_DT cold_JJ winter_NN wind_NN had_VBD brought_VBN with_IN it_PRP clouds_NNS so_RB sombre_JJ ,_, and_CC a_DT rain_NN so_RB penetrating_JJ ,_, that_IN further_JJ outdoor_JJ exercise_NN was_VBD now_RB out_IN of_IN the_DT question_NN ._. + +I_PRP was_VBD glad_JJ of_IN it_PRP :_: I_PRP never_RB liked_VBD long_JJ walks_NNS ,_, especially_RB on_IN chilly_JJ afternoons_NNS :_: dreadful_JJ to_IN me_PRP was_VBD the_DT coming_VBG home_NN in_IN the_DT raw_JJ twilight_NN ,_, with_IN nipped_VBN fingers_NNS and_CC toes_NNS ,_, and_CC a_DT heart_NN saddened_VBN by_IN the_DT chidings_NNS of_IN Bessie_NNP ,_, the_DT nurse_NN ,_, and_CC humbled_VBN by_IN the_DT consciousness_NN of_IN my_PRP$ physical_JJ inferiority_NN to_IN Eliza_NNP ,_, John_NNP ,_, and_CC Georgiana_NNP Reed_NNP ._. + +The_DT said_VBN Eliza_NNP ,_, John_NNP ,_, and_CC Georgiana_NNP were_VBD now_RB clustered_VBN round_IN their_PRP$ mama_NN in_IN the_DT drawing_NN -_HYPH room_NN :_: she_PRP lay_VBP reclined_VBN on_IN a_DT sofa_NN by_IN the_DT fireside_NN ,_, and_CC with_IN her_PRP$ darlings_NNS about_IN her_PRP (_-LRB- for_IN the_DT time_NN neither_CC quarrelling_VBG nor_CC crying_VBG )_-RRB- looked_VBD perfectly_RB happy_JJ ._. 
Me_PRP ,_, she_PRP had_VBD dispensed_VBN from_IN joining_VBG the_DT group_NN ;_: saying_VBG ,_, “_`` She_PRP regretted_VBD to_TO be_VB under_IN the_DT necessity_NN of_IN keeping_VBG me_PRP at_IN a_DT distance_NN ;_: but_CC that_IN until_IN she_PRP heard_VBD from_IN Bessie_NNP ,_, and_CC could_MD discover_VB by_IN her_PRP$ own_JJ observation_NN ,_, that_IN I_PRP was_VBD endeavouring_VBG in_IN good_JJ earnest_NN to_TO acquire_VB a_DT more_RBR sociable_JJ and_CC childlike_JJ disposition_NN ,_, a_DT more_RBR attractive_JJ and_CC sprightly_JJ manner_NN —_: something_NN lighter_JJR ,_, franker_JJR ,_, more_RBR natural_JJ ,_, as_IN it_PRP were_VBD —_: she_PRP really_RB must_MD exclude_VB me_PRP from_IN privileges_NNS intended_VBN only_RB for_IN contented_JJ ,_, happy_JJ ,_, little_JJ children_NNS ._. ”_'' + +“_`` What_WP does_VBZ Bessie_NNP say_VB I_PRP have_VBP done_VBN ?_. ”_'' I_PRP asked_VBD ._. + +“_`` Jane_NNP ,_, I_PRP do_VBP n’t_RB like_VB cavillers_NNS or_CC questioners_NNS ;_: besides_RB ,_, there_EX is_VBZ something_NN truly_RB forbidding_JJ in_IN a_DT child_NN taking_VBG up_RP her_PRP$ elders_NNS in_IN that_DT manner_NN ._. Be_VB seated_VBN somewhere_RB ;_: and_CC until_IN you_PRP can_MD speak_VB pleasantly_RB ,_, remain_VB silent_JJ ._. ”_'' + +A_DT breakfast_NN -_HYPH room_NN adjoined_VBD the_DT drawing_NN -_HYPH room_NN ,_, I_PRP slipped_VBD in_RP there_RB ._. It_PRP contained_VBD a_DT bookcase_NN :_: I_PRP soon_RB possessed_VBD myself_PRP of_IN a_DT volume_NN ,_, taking_VBG care_NN that_IN it_PRP should_MD be_VB one_CD stored_VBN with_IN pictures_NNS ._. I_PRP mounted_VBD into_IN the_DT window_NN -_HYPH seat_NN :_: gathering_VBG up_RP my_PRP$ feet_NNS ,_, I_PRP sat_VBD cross_JJ -_JJ legged_JJ ,_, like_IN a_DT Turk_NNP ;_: and_CC ,_, having_VBG drawn_VBN the_DT red_JJ moreen_JJR curtain_NN nearly_RB close_RB ,_, I_PRP was_VBD shrined_VBN in_IN double_JJ retirement_NN ._. + +Folds_NNS of_IN scarlet_JJ drapery_NN shut_VBD in_IN my_PRP$ view_NN to_IN the_DT right_JJ hand_NN ;_: to_IN the_DT left_JJ were_VBD the_DT clear_JJ panes_NNS of_IN glass_NN ,_, protecting_VBG ,_, but_CC not_RB separating_VBG me_PRP from_IN the_DT drear_JJ November_NNP day_NN ._. At_IN intervals_NNS ,_, while_IN turning_VBG over_RP the_DT leaves_NNS of_IN my_PRP$ book_NN ,_, I_PRP studied_VBD the_DT aspect_NN of_IN that_DT winter_NN afternoon_NN ._. Afar_NNP ,_, it_PRP offered_VBD a_DT pale_JJ blank_NN of_IN mist_NN and_CC cloud_NN ;_: near_IN a_DT scene_NN of_IN wet_JJ lawn_NN and_CC storm_NN -_HYPH beat_NN shrub_NN ,_, with_IN ceaseless_JJ rain_NN sweeping_VBG away_RB wildly_RB before_IN a_DT long_JJ and_CC lamentable_JJ blast_NN ._. + +I_PRP returned_VBD to_IN my_PRP$ book_NN —_: Bewick_NNP ’s_POS History_NNP of_IN British_NNP Birds_NNS :_: the_DT letterpress_NN thereof_RB I_PRP cared_VBD little_RB for_IN ,_, generally_RB speaking_VBG ;_: and_CC yet_RB there_EX were_VBD certain_JJ introductory_JJ pages_NNS that_WDT ,_, child_NN as_IN I_PRP was_VBD ,_, I_PRP could_MD not_RB pass_VB quite_RB as_IN a_DT blank_NN ._. 
They_PRP were_VBD those_DT which_WDT treat_VBP of_IN the_DT haunts_NNS of_IN sea_NN -_HYPH fowl_NN ;_: of_IN “_`` the_DT solitary_JJ rocks_NNS and_CC promontories_NNS ”_'' by_IN them_PRP only_RB inhabited_VBN ;_: of_IN the_DT coast_NN of_IN Norway_NNP ,_, studded_VBN with_IN isles_NNS from_IN its_PRP$ southern_JJ extremity_NN ,_, the_DT Lindeness_NNP ,_, or_CC Naze_NNP ,_, to_IN the_DT North_NNP Cape_NNP —_: + +“_`` Where_WRB the_DT Northern_NNP Ocean_NNP ,_, in_IN vast_JJ whirls_NNS ,_, +Boils_NNS round_IN the_DT naked_JJ ,_, melancholy_JJ isles_NNS +Of_IN farthest_JJS Thule_NNP ;_: and_CC the_DT Atlantic_NNP surge_NN +Pours_VBZ in_RP among_IN the_DT stormy_JJ Hebrides_NNPS ._. ”_'' + +Nor_CC could_MD I_PRP pass_VB unnoticed_JJ the_DT suggestion_NN of_IN the_DT bleak_JJ shores_NNS of_IN Lapland_NNP ,_, Siberia_NNP ,_, Spitzbergen_NNP ,_, Nova_NNP Zembla_NNP ,_, Iceland_NNP ,_, Greenland_NNP ,_, with_IN “_`` the_DT vast_JJ sweep_NN of_IN the_DT Arctic_NNP Zone_NNP ,_, and_CC those_DT forlorn_JJ regions_NNS of_IN dreary_JJ space,—that_NN reservoir_NN of_IN frost_NN and_CC snow_NN ,_, where_WRB firm_JJ fields_NNS of_IN ice_NN ,_, the_DT accumulation_NN of_IN centuries_NNS of_IN winters_NNS ,_, glazed_VBD in_IN Alpine_JJ heights_NNS above_IN heights_NNS ,_, surround_VBP the_DT pole_NN ,_, and_CC concentre_VBP the_DT multiplied_JJ rigours_NNS of_IN extreme_JJ cold_NN ._. ”_'' Of_IN these_DT death_NN -_HYPH white_JJ realms_NNS I_PRP formed_VBD an_DT idea_NN of_IN my_PRP$ own_JJ :_: shadowy_JJ ,_, like_IN all_DT the_DT half_RB -_HYPH comprehended_JJ notions_NNS that_WDT float_VBP dim_JJ through_IN children_NNS ’s_POS brains_NNS ,_, but_CC strangely_RB impressive_JJ ._. The_DT words_NNS in_IN these_DT introductory_JJ pages_NNS connected_VBD themselves_PRP with_IN the_DT succeeding_VBG vignettes_NNS ,_, and_CC gave_VBD significance_NN to_IN the_DT rock_NN standing_VBG up_RP alone_RB in_IN a_DT sea_NN of_IN billow_NN and_CC spray_NN ;_: to_IN the_DT broken_JJ boat_NN stranded_VBN on_IN a_DT desolate_JJ coast_NN ;_: to_IN the_DT cold_JJ and_CC ghastly_JJ moon_NN glancing_VBG through_IN bars_NNS of_IN cloud_NN at_IN a_DT wreck_NN just_RB sinking_VBG ._. + +I_PRP can_MD not_RB tell_VB what_WDT sentiment_NN haunted_VBD the_DT quite_RB solitary_JJ churchyard_NN ,_, with_IN its_PRP$ inscribed_VBN headstone_NN ;_: its_PRP$ gate_NN ,_, its_PRP$ two_CD trees_NNS ,_, its_PRP$ low_JJ horizon_NN ,_, girdled_VBN by_IN a_DT broken_JJ wall_NN ,_, and_CC its_PRP$ newly_RB -_HYPH risen_VBN crescent_NN ,_, attesting_VBG the_DT hour_NN of_IN eventide_NNP ._. + +The_DT two_CD ships_NNS becalmed_VBN on_IN a_DT torpid_JJ sea_NN ,_, I_PRP believed_VBD to_TO be_VB marine_JJ phantoms_NNS ._. + +The_DT fiend_NN pinning_VBG down_RP the_DT thief_NN ’s_POS pack_NN behind_IN him_PRP ,_, I_PRP passed_VBD over_RP quickly_RB :_: it_PRP was_VBD an_DT object_NN of_IN terror_NN ._. + +So_RB was_VBD the_DT black_JJ horned_JJ thing_NN seated_VBN aloof_RB on_IN a_DT rock_NN ,_, surveying_VBG a_DT distant_JJ crowd_NN surrounding_VBG a_DT gallows_NN ._. 
+ +Each_DT picture_NN told_VBD a_DT story_NN ;_: mysterious_JJ often_RB to_IN my_PRP$ undeveloped_JJ understanding_NN and_CC imperfect_JJ feelings_NNS ,_, yet_CC ever_RB profoundly_RB interesting_JJ :_: as_RB interesting_JJ as_IN the_DT tales_NNS Bessie_NNP sometimes_RB narrated_VBD on_IN winter_NN evenings_NNS ,_, when_WRB she_PRP chanced_VBD to_TO be_VB in_IN good_JJ humour_NN ;_: and_CC when_WRB ,_, having_VBG brought_VBN her_PRP$ ironing_NN -_HYPH table_NN to_IN the_DT nursery_NN hearth_NN ,_, she_PRP allowed_VBD us_PRP to_TO sit_VB about_IN it_PRP ,_, and_CC while_IN she_PRP got_VBD up_RP Mrs._NNP Reed_NNP ’s_POS lace_NN frills_NNS ,_, and_CC crimped_VBD her_PRP$ nightcap_NN borders_NNS ,_, fed_VBD our_PRP$ eager_JJ attention_NN with_IN passages_NNS of_IN love_NN and_CC adventure_NN taken_VBN from_IN old_JJ fairy_NN tales_NNS and_CC other_JJ ballads_NNS ;_: or_CC (_-LRB- as_IN at_IN a_DT later_JJR period_NN I_PRP discovered_VBD )_-RRB- from_IN the_DT pages_NNS of_IN Pamela_NNP ,_, and_CC Henry_NNP ,_, Earl_NNP of_IN Moreland_NNP ._. + +With_IN Bewick_NNP on_IN my_PRP$ knee_NN ,_, I_PRP was_VBD then_RB happy_JJ :_: happy_JJ at_IN least_JJS in_IN my_PRP$ way_NN ._. I_PRP feared_VBD nothing_NN but_CC interruption_NN ,_, and_CC that_DT came_VBD too_RB soon_RB ._. The_DT breakfast_NN -_HYPH room_NN door_NN opened_VBD ._. + +“_`` Boh_UH !_. Madam_NNP Mope_NNP !_. ”_'' cried_VBD the_DT voice_NN of_IN John_NNP Reed_NNP ;_: then_RB he_PRP paused_VBD :_: he_PRP found_VBD the_DT room_NN apparently_RB empty_JJ ._. + +“_`` Where_WRB the_DT dickens_NNS is_VBZ she_PRP !_. ”_'' he_PRP continued_VBD ._. “_`` Lizzy_NNP !_. Georgy_NNP !_. (_-LRB- calling_VBG to_IN his_PRP$ sisters_NNS )_-RRB- Joan_NNP is_VBZ not_RB here_RB :_: tell_VB mama_NN she_PRP is_VBZ run_VBN out_RP into_IN the_DT rain_NN —_: bad_JJ animal_NN !_. ”_'' + +“_`` It_PRP is_VBZ well_RB I_PRP drew_VBD the_DT curtain_NN ,_, ”_'' thought_VBD I_PRP ;_: and_CC I_PRP wished_VBD fervently_RB he_PRP might_MD not_RB discover_VB my_PRP$ hiding_NN -_HYPH place_NN :_: nor_CC would_MD John_NNP Reed_NNP have_VB found_VBN it_PRP out_RP himself_PRP ;_: he_PRP was_VBD not_RB quick_JJ either_CC of_IN vision_NN or_CC conception_NN ;_: but_CC Eliza_NNP just_RB put_VBD her_PRP$ head_NN in_RP at_IN the_DT door_NN ,_, and_CC said_VBD at_IN once_RB —_: + +“_`` She_PRP is_VBZ in_IN the_DT window_NN -_HYPH seat_NN ,_, to_TO be_VB sure_JJ ,_, Jack_NNP ._. ”_'' + +And_CC I_PRP came_VBD out_RP immediately_RB ,_, for_IN I_PRP trembled_VBD at_IN the_DT idea_NN of_IN being_VBG dragged_VBN forth_RB by_IN the_DT said_VBN Jack_NNP ._. + +“_`` What_WP do_VBP you_PRP want_VB ?_. ”_'' I_PRP asked_VBD ,_, with_IN awkward_JJ diffidence_NN ._. + +“_`` Say_VB ,_, ‘_`` What_WP do_VBP you_PRP want_VB ,_, Master_NNP Reed_NNP ?_. ’_'' ”_'' was_VBD the_DT answer_NN ._. “_`` I_PRP want_VBP you_PRP to_TO come_VB here_RB ;_: ”_'' and_CC seating_VBG himself_PRP in_IN an_DT arm_NN -_HYPH chair_NN ,_, he_PRP intimated_VBD by_IN a_DT gesture_NN that_IN I_PRP was_VBD to_TO approach_VB and_CC stand_VB before_IN him_PRP ._. + +John_NNP Reed_NNP was_VBD a_DT schoolboy_NN of_IN fourteen_CD years_NNS old_JJ ;_: four_CD years_NNS older_JJR than_IN I_PRP ,_, for_IN I_PRP was_VBD but_RB ten_CD :_: large_JJ and_CC stout_JJ for_IN his_PRP$ age_NN ,_, with_IN a_DT dingy_JJ and_CC unwholesome_JJ skin_NN ;_: thick_JJ lineaments_NNS in_IN a_DT spacious_JJ visage_NN ,_, heavy_JJ limbs_NNS and_CC large_JJ extremities_NNS ._. 
He_PRP gorged_VBD himself_PRP habitually_RB at_IN table_NN ,_, which_WDT made_VBD him_PRP bilious_JJ ,_, and_CC gave_VBD him_PRP a_DT dim_JJ and_CC bleared_JJ eye_NN and_CC flabby_JJ cheeks_NNS ._. He_PRP ought_MD now_RB to_TO have_VB been_VBN at_IN school_NN ;_: but_CC his_PRP$ mama_NN had_VBD taken_VBN him_PRP home_RB for_IN a_DT month_NN or_CC two_CD ,_, “_`` on_IN account_NN of_IN his_PRP$ delicate_JJ health_NN ._. ”_'' Mr._NNP Miles_NNP ,_, the_DT master_NN ,_, affirmed_VBD that_IN he_PRP would_MD do_VB very_RB well_RB if_IN he_PRP had_VBD fewer_JJR cakes_NNS and_CC sweetmeats_NNS sent_VBN him_PRP from_IN home_RB ;_: but_CC the_DT mother_NN ’s_POS heart_NN turned_VBD from_IN an_DT opinion_NN so_RB harsh_JJ ,_, and_CC inclined_VBD rather_RB to_IN the_DT more_RBR refined_JJ idea_NN that_IN John_NNP ’s_POS sallowness_NN was_VBD owing_VBG to_IN over_NN -_HYPH application_NN and_CC ,_, perhaps_RB ,_, to_IN pining_VBG after_IN home_NN ._. + +John_NNP had_VBD not_RB much_JJ affection_NN for_IN his_PRP$ mother_NN and_CC sisters_NNS ,_, and_CC an_DT antipathy_NN to_IN me_PRP ._. He_PRP bullied_VBD and_CC punished_VBD me_PRP ;_: not_RB two_CD or_CC three_CD times_NNS in_IN the_DT week_NN ,_, nor_CC once_RB or_CC twice_RB in_IN the_DT day_NN ,_, but_CC continually_RB :_: every_DT nerve_NN I_PRP had_VBD feared_VBD him_PRP ,_, and_CC every_DT morsel_NN of_IN flesh_NN in_IN my_PRP$ bones_NNS shrank_VBD when_WRB he_PRP came_VBD near_RB ._. There_EX were_VBD moments_NNS when_WRB I_PRP was_VBD bewildered_VBN by_IN the_DT terror_NN he_PRP inspired_VBD ,_, because_IN I_PRP had_VBD no_DT appeal_NN whatever_WDT against_IN either_CC his_PRP$ menaces_NNS or_CC his_PRP$ inflictions_NNS ;_: the_DT servants_NNS did_VBD not_RB like_VB to_TO offend_VB their_PRP$ young_JJ master_NN by_IN taking_VBG my_PRP$ part_NN against_IN him_PRP ,_, and_CC Mrs._NNP Reed_NNP was_VBD blind_JJ and_CC deaf_JJ on_IN the_DT subject_NN :_: she_PRP never_RB saw_VBD him_PRP strike_VB or_CC heard_VBD him_PRP abuse_VB me_PRP ,_, though_IN he_PRP did_VBD both_DT now_RB and_CC then_RB in_IN her_PRP$ very_JJ presence_NN ,_, more_RBR frequently_RB ,_, however_RB ,_, behind_IN her_PRP$ back_NN ._. + +Habitually_RB obedient_JJ to_IN John_NNP ,_, I_PRP came_VBD up_RB to_IN his_PRP$ chair_NN :_: he_PRP spent_VBD some_DT three_CD minutes_NNS in_IN thrusting_VBG out_RP his_PRP$ tongue_NN at_IN me_PRP as_RB far_RB as_IN he_PRP could_MD without_IN damaging_VBG the_DT roots_NNS :_: I_PRP knew_VBD he_PRP would_MD soon_RB strike_VB ,_, and_CC while_IN dreading_VBG the_DT blow_NN ,_, I_PRP mused_VBD on_IN the_DT disgusting_JJ and_CC ugly_JJ appearance_NN of_IN him_PRP who_WP would_MD presently_RB deal_VB it_PRP ._. I_PRP wonder_VBP if_IN he_PRP read_VBD that_DT notion_NN in_IN my_PRP$ face_NN ;_: for_IN ,_, all_RB at_IN once_RB ,_, without_IN speaking_VBG ,_, he_PRP struck_VBD suddenly_RB and_CC strongly_RB ._. I_PRP tottered_VBD ,_, and_CC on_IN regaining_VBG my_PRP$ equilibrium_NN retired_VBD back_RB a_DT step_NN or_CC two_CD from_IN his_PRP$ chair_NN ._. + +“_`` That_DT is_VBZ for_IN your_PRP$ impudence_NN in_IN answering_VBG mama_NN awhile_RB since_RB ,_, ”_'' said_VBD he_PRP ,_, “_`` and_CC for_IN your_PRP$ sneaking_NN way_NN of_IN getting_VBG behind_IN curtains_NNS ,_, and_CC for_IN the_DT look_NN you_PRP had_VBD in_IN your_PRP$ eyes_NNS two_CD minutes_NNS since_RB ,_, you_PRP rat_NN !_. 
”_'' + +Accustomed_JJ to_IN John_NNP Reed_NNP ’s_POS abuse_NN ,_, I_PRP never_RB had_VBD an_DT idea_NN of_IN replying_VBG to_IN it_PRP ;_: my_PRP$ care_NN was_VBD how_WRB to_TO endure_VB the_DT blow_NN which_WDT would_MD certainly_RB follow_VB the_DT insult_NN ._. + +“_`` What_WP were_VBD you_PRP doing_VBG behind_IN the_DT curtain_NN ?_. ”_'' he_PRP asked_VBD ._. + +“_`` I_PRP was_VBD reading_VBG ._. ”_'' + +“_`` Show_VB the_DT book_NN ._. ”_'' + +I_PRP returned_VBD to_IN the_DT window_NN and_CC fetched_VBD it_PRP thence_RB ._. + +“_`` You_PRP have_VBP no_DT business_NN to_TO take_VB our_PRP$ books_NNS ;_: you_PRP are_VBP a_DT dependent_NN ,_, mama_NN says_VBZ ;_: you_PRP have_VBP no_DT money_NN ;_: your_PRP$ father_NN left_VBD you_PRP none_NN ;_: you_PRP ought_MD to_TO beg_VB ,_, and_CC not_RB to_TO live_VB here_RB with_IN gentlemen_NNS ’s_POS children_NNS like_IN us_PRP ,_, and_CC eat_VB the_DT same_JJ meals_NNS we_PRP do_VBP ,_, and_CC wear_VB clothes_NNS at_IN our_PRP$ mama_NN ’s_POS expense_NN ._. Now_RB ,_, I_PRP ’ll_MD teach_VB you_PRP to_TO rummage_VB my_PRP$ bookshelves_NNS :_: for_IN they_PRP are_VBP mine_PRP$ ;_: all_PDT the_DT house_NN belongs_VBZ to_IN me_PRP ,_, or_CC will_MD do_VB in_IN a_DT few_JJ years_NNS ._. Go_VB and_CC stand_VB by_IN the_DT door_NN ,_, out_IN of_IN the_DT way_NN of_IN the_DT mirror_NN and_CC the_DT windows_NNS ._. ”_'' + +I_PRP did_VBD so_RB ,_, not_RB at_IN first_RB aware_JJ what_WP was_VBD his_PRP$ intention_NN ;_: but_CC when_WRB I_PRP saw_VBD him_PRP lift_VB and_CC poise_VB the_DT book_NN and_CC stand_VB in_IN act_NN to_TO hurl_VB it_PRP ,_, I_PRP instinctively_RB started_VBD aside_RB with_IN a_DT cry_NN of_IN alarm_NN :_: not_RB soon_RB enough_RB ,_, however_RB ;_: the_DT volume_NN was_VBD flung_VBN ,_, it_PRP hit_VBD me_PRP ,_, and_CC I_PRP fell_VBD ,_, striking_VBG my_PRP$ head_NN against_IN the_DT door_NN and_CC cutting_VBG it_PRP ._. The_DT cut_NN bled_VBD ,_, the_DT pain_NN was_VBD sharp_JJ :_: my_PRP$ terror_NN had_VBD passed_VBN its_PRP$ climax_NN ;_: other_JJ feelings_NNS succeeded_VBD ._. + +“_`` Wicked_JJ and_CC cruel_JJ boy_NN !_. ”_'' I_PRP said_VBD ._. “_`` You_PRP are_VBP like_IN a_DT murderer_NN —_: you_PRP are_VBP like_IN a_DT slave_NN -_HYPH driver_NN —_: you_PRP are_VBP like_IN the_DT Roman_JJ emperors_NNS !_. ”_'' + +I_PRP had_VBD read_VBN Goldsmith_NNP ’s_POS History_NNP of_IN Rome_NNP ,_, and_CC had_VBD formed_VBN my_PRP$ opinion_NN of_IN Nero_NNP ,_, Caligula_NNP ,_, &_FW c._. Also_RB I_PRP had_VBD drawn_VBN parallels_NNS in_IN silence_NN ,_, which_WDT I_PRP never_RB thought_VBD thus_RB to_TO have_VB declared_VBN aloud_RB ._. + +“_`` What_WP !_. what_WP !_. ”_'' he_PRP cried_VBD ._. “_`` Did_VBD she_PRP say_VB that_DT to_IN me_PRP ?_. Did_VBD you_PRP hear_VB her_PRP ,_, Eliza_NNP and_CC Georgiana_NNP ?_. Wo_MD n’t_RB I_PRP tell_VB mama_NN ?_. but_CC first_RB —_: ”_'' + +He_PRP ran_VBD headlong_RB at_IN me_PRP :_: I_PRP felt_VBD him_PRP grasp_VB my_PRP$ hair_NN and_CC my_PRP$ shoulder_NN :_: he_PRP had_VBD closed_VBN with_IN a_DT desperate_JJ thing_NN ._. I_PRP really_RB saw_VBD in_IN him_PRP a_DT tyrant_NN ,_, a_DT murderer_NN ._. I_PRP felt_VBD a_DT drop_NN or_CC two_CD of_IN blood_NN from_IN my_PRP$ head_NN trickle_VB down_IN my_PRP$ neck_NN ,_, and_CC was_VBD sensible_JJ of_IN somewhat_RB pungent_JJ suffering_NN :_: these_DT sensations_NNS for_IN the_DT time_NN predominated_VBD over_IN fear_NN ,_, and_CC I_PRP received_VBD him_PRP in_IN frantic_JJ sort_NN ._. 
I_PRP do_VBP n’t_RB very_RB well_RB know_VB what_WP I_PRP did_VBD with_IN my_PRP$ hands_NNS ,_, but_CC he_PRP called_VBD me_PRP “_`` Rat_NNP !_. Rat_NN !_. ”_'' and_CC bellowed_VBD out_RP aloud_RB ._. Aid_NN was_VBD near_IN him_PRP :_: Eliza_NNP and_CC Georgiana_NNP had_VBD run_VBN for_IN Mrs._NNP Reed_NNP ,_, who_WP was_VBD gone_VBN upstairs_RB :_: she_PRP now_RB came_VBD upon_IN the_DT scene_NN ,_, followed_VBN by_IN Bessie_NNP and_CC her_PRP$ maid_NN Abbot_NNP ._. We_PRP were_VBD parted_VBN :_: I_PRP heard_VBD the_DT words_NNS —_: + +“_`` Dear_JJ !_. dear_JJ !_. What_WDT a_DT fury_NN to_TO fly_VB at_IN Master_NNP John_NNP !_. ”_'' + +“_`` Did_VBD ever_RB anybody_NN see_VB such_PDT a_DT picture_NN of_IN passion_NN !_. ”_'' + +Then_RB Mrs._NNP Reed_NNP subjoined_VBD —_: + +“_`` Take_VB her_PRP away_RB to_IN the_DT red_JJ -_HYPH room_NN ,_, and_CC lock_VB her_PRP in_RP there_RB ._. ”_'' Four_CD hands_NNS were_VBD immediately_RB laid_VBN upon_IN me_PRP ,_, and_CC I_PRP was_VBD borne_VBN upstairs_RB ._. diff --git a/tests/test_colligation_extractor.py b/tests/test_colligation_extractor.py index d2ef2947d..9ac6ba4d4 100644 --- a/tests/test_colligation_extractor.py +++ b/tests/test_colligation_extractor.py @@ -86,14 +86,18 @@ def update_gui(err_msg, colligations_freqs_files, colligations_stats_files): assert node # Collocate assert collocate + # Frequency (span positions) for freqs_file in freqs_files: assert len(freqs_file) == 10 + # Frequency (total) assert sum((sum(freqs_file) for freqs_file in freqs_files)) >= 0 + # p-value for _, p_value, _, _ in stats_files: assert p_value is None or 0 <= p_value <= 1 + # Number of Files Found assert len([freqs_file for freqs_file in freqs_files[:-1] if sum(freqs_file)]) >= 1 diff --git a/tests/test_collocation_extractor.py b/tests/test_collocation_extractor.py index c02e849d9..fbf7dd0b6 100644 --- a/tests/test_collocation_extractor.py +++ b/tests/test_collocation_extractor.py @@ -86,14 +86,18 @@ def update_gui(err_msg, collocations_freqs_files, collocations_stats_files): assert node # Collocate assert collocate + # Frequency (span positions) for freqs_file in freqs_files: assert len(freqs_file) == 10 + # Frequency (total) assert sum((sum(freqs_file) for freqs_file in freqs_files)) >= 0 + # p-value for _, p_value, _, _ in stats_files: assert p_value is None or 0 <= p_value <= 1 + # Number of Files Found assert len([freqs_file for freqs_file in freqs_files[:-1] if sum(freqs_file)]) >= 1 diff --git a/tests/test_concordancer.py b/tests/test_concordancer.py index f95a60116..b0ed01fcf 100644 --- a/tests/test_concordancer.py +++ b/tests/test_concordancer.py @@ -42,7 +42,7 @@ def test_concordancer(): case _: wl_test_init.select_test_files(main, no_files = [i + 1]) - global main_global # pylint: disable=global-statement + global main_global main_global = main print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}") @@ -66,9 +66,9 @@ def update_gui_table(err_msg, concordance_lines): file_names_selected = list(main_global.wl_file_area.get_selected_file_names()) for concordance_line in concordance_lines: - left_text, left_text_raw, left_text_search = concordance_line[0] - node_text, node_text_raw, node_text_search = concordance_line[1] - right_text, right_text_raw, right_text_search = concordance_line[2] + left_tokens_raw, left_tokens_search = concordance_line[0] + node_tokens_raw, node_tokens_search = concordance_line[1] + right_tokens_raw, right_tokens_search = concordance_line[2] sentiment = concordance_line[3] no_token, len_tokens = 
concordance_line[4] @@ -78,19 +78,16 @@ def update_gui_table(err_msg, concordance_lines): file_name = concordance_line[8] # Node - assert node_text - assert node_text_raw - assert node_text_search + assert node_tokens_raw + assert node_tokens_search + # Left & Right - assert left_text or right_text - assert left_text == [] or all(left_text) - assert right_text == [] or all(right_text) - assert left_text_raw or right_text_raw - assert left_text_raw == [] or all(left_text_raw) - assert right_text_raw == [] or all(right_text_raw) - assert left_text_search or right_text_search - assert left_text_search == [] or all(left_text_search) - assert right_text_search == [] or all(right_text_search) + assert left_tokens_raw or right_tokens_raw + assert left_tokens_raw == [] or all(left_tokens_raw) + assert right_tokens_raw == [] or all(right_tokens_raw) + assert left_tokens_search or right_tokens_search + assert left_tokens_search == [] or all(left_tokens_search) + assert right_tokens_search == [] or all(right_tokens_search) # Sentiment assert sentiment == 'No language support' or -1 <= sentiment <= 1 diff --git a/tests/test_dependency_parser.py b/tests/test_dependency_parser.py index 9df86e776..cea0d338a 100644 --- a/tests/test_dependency_parser.py +++ b/tests/test_dependency_parser.py @@ -42,7 +42,7 @@ def test_dependency_parser(): case _: wl_test_init.select_test_files(main, no_files = [i + 1]) - global main_global # pylint: disable=global-statement + global main_global main_global = main print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}") diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py index b75ba1c5e..7d431bbc1 100644 --- a/tests/test_keyword_extractor.py +++ b/tests/test_keyword_extractor.py @@ -16,8 +16,6 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>.
# ---------------------------------------------------------------------- -# pylint: disable=unsupported-assignment-operation - import random from tests import wl_test_init diff --git a/tests/test_ngram_generator.py b/tests/test_ngram_generator.py index ee5588936..f84d85f8b 100644 --- a/tests/test_ngram_generator.py +++ b/tests/test_ngram_generator.py @@ -47,7 +47,7 @@ def test_ngram_generator(): case _: wl_test_init.select_test_files(main, no_files = [i + 1]) - global main_global # pylint: disable=global-statement + global main_global main_global = main settings['generation_settings']['measure_dispersion'] = random.choice(measures_dispersion) diff --git a/tests/test_profiler.py b/tests/test_profiler.py index f96ffaf10..cca98ad2b 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -44,7 +44,7 @@ def test_profiler(): case _: wl_test_init.select_test_files(main, no_files = [i + 1]) - global main_global # pylint: disable=global-statement + global main_global main_global = main print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}") diff --git a/tests/test_wordlist_generator.py b/tests/test_wordlist_generator.py index 6995af40c..58aa0055b 100644 --- a/tests/test_wordlist_generator.py +++ b/tests/test_wordlist_generator.py @@ -44,7 +44,7 @@ def test_wordlist_generator(): case _: wl_test_init.select_test_files(main, no_files = [i + 1]) - global main_global # pylint: disable=global-statement + global main_global main_global = main settings['generation_settings']['measure_dispersion'] = random.choice(measures_dispersion) diff --git a/tests/tests_figs/test_figs.py b/tests/tests_figs/test_figs.py index eab32229a..4fa7e2337 100644 --- a/tests/tests_figs/test_figs.py +++ b/tests/tests_figs/test_figs.py @@ -38,10 +38,10 @@ def test_get_data_ranks(): assert wl_figs.get_data_ranks(data_files_items, fig_settings) == [(str(i), i) for i in range(50)] def test_generate_line_chart(): - main = wl_test_init.Wl_Test_Main() + main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') wl_test_init.select_test_files(main, no_files = [0, 1]) - global main_global # pylint: disable=global-statement + global main_global main_global = main wl_figs.generate_line_chart( diff --git a/tests/tests_figs/test_figs_freqs.py b/tests/tests_figs/test_figs_freqs.py index 0d12a7212..810670f65 100644 --- a/tests/tests_figs/test_figs_freqs.py +++ b/tests/tests_figs/test_figs_freqs.py @@ -20,9 +20,10 @@ from tests import wl_test_init from wordless.wl_figs import wl_figs_freqs +from wordless.wl_nlp import wl_texts def test_wl_fig_freqs(): - main = wl_test_init.Wl_Test_Main() + main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') for tab in [ 'wordlist_generator', @@ -45,10 +46,13 @@ def test_wl_fig_freqs(): if graph_type == 'Network Graph': for node in range(10): + node = wl_texts.Wl_Token(str(node)) + for collocate in range(10): + collocate = wl_texts.Wl_Token(str(collocate)) freq_1, freq_2 = random.sample(range(10000), 2) - freq_files_items[(str(node), str(collocate))] = [ + freq_files_items[(node, collocate)] = [ max(freq_1, freq_2) - min(freq_1, freq_2), min(freq_1, freq_2), max(freq_1, freq_2) @@ -56,9 +60,10 @@ def test_wl_fig_freqs(): else: if tab == 'keyword_extractor': for item in range(100): + item = wl_texts.Wl_Token(str(item)) freq_1, freq_2 = random.sample(range(100), 2) - freq_files_items[str(item)] = [ + freq_files_items[item] = [ random.randint(0, 100), max(freq_1, freq_2) - min(freq_1, freq_2), min(freq_1, freq_2), @@ -66,9 +71,10 @@ def test_wl_fig_freqs(): ] else: for item in 
range(100): + item = wl_texts.Wl_Token(str(item)) freq_1, freq_2 = random.sample(range(100), 2) - freq_files_items[str(item)] = [ + freq_files_items[item] = [ max(freq_1, freq_2) - min(freq_1, freq_2), min(freq_1, freq_2), max(freq_1, freq_2) diff --git a/tests/tests_figs/test_figs_stats.py b/tests/tests_figs/test_figs_stats.py index 7c73d6ee7..e631ddffb 100644 --- a/tests/tests_figs/test_figs_stats.py +++ b/tests/tests_figs/test_figs_stats.py @@ -20,9 +20,10 @@ from tests import wl_test_init from wordless.wl_figs import wl_figs_stats +from wordless.wl_nlp import wl_texts def test_wl_fig_stats(): - main = wl_test_init.Wl_Test_Main() + main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') for tab in [ 'wordlist_generator', @@ -55,15 +56,19 @@ def test_wl_fig_stats(): if graph_type == 'Network Graph': for node in range(10): + node = wl_texts.Wl_Token(str(node)) + for collocate in range(10): - stat_files_items[(str(node), str(collocate))] = [ + collocate = wl_texts.Wl_Token(str(collocate)) + stat_files_items[(node, collocate)] = [ random.uniform(0, val_max), random.uniform(0, val_max), random.uniform(0, val_max) ] else: for item in range(100): - stat_files_items[str(item)] = [ + item = wl_texts.Wl_Token(str(item)) + stat_files_items[item] = [ random.uniform(0, val_max), random.uniform(0, val_max), random.uniform(0, val_max) diff --git a/tests/tests_file_area/test_file_area_file_types.py b/tests/tests_file_area/test_file_area_file_types.py index 623289b8c..04e1d3daf 100644 --- a/tests/tests_file_area/test_file_area_file_types.py +++ b/tests/tests_file_area/test_file_area_file_types.py @@ -28,7 +28,7 @@ from wordless.wl_dialogs import wl_dialogs_misc from wordless.wl_nlp import wl_texts -main = wl_test_init.Wl_Test_Main() +main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') def add_file(file_paths, update_gui, file_type = 'observed'): def open_file(err_msg, files_to_open): @@ -152,59 +152,67 @@ def update_gui_file_types(err_msg, new_files): file_name = os.path.split(new_files[0]['path'])[1] file_text = new_files[0]['text'] - print(file_text.tokens_multilevel) - print(file_text.tags) + tokens = file_text.to_token_texts() + tags = file_text.get_token_properties('tag') + + print(tokens) # CSV files if file_name == 'csv.txt': - assert file_text.tokens_multilevel == [[], [], [[['3', '-', '2']], [['3', '-', '3']]], [], [], [[['6', '-', '2']], [['6', '-', '3']]], [], []] + assert tokens == [[], [], [[['3-2', '3-3']]], [], [], [[['6-2', '6-3']]], [], []] # Excel workbooks elif file_name == 'xlsx.txt': - assert file_text.tokens_multilevel == [[], [[['B2', '&', 'C2']], [['D2']]], [[['B3', '&', 'B4']], [['C3']], [['D3']]], [[['C4']], [['D4']]], [[['B5']], [['C5']], [['D5']]], [], [], [[['B2', '&', 'C2']], [['D2']]], [[['B3', '&', 'B4']], [['C3']], [['D3']]], [[['C4']], [['D4']]], [[['B5']], [['C5']], [['D5']]]] + assert tokens == [[], [[['B2', '&', 'C2', 'D2']]], [[['B3', '&', 'B4', 'C3', 'D3']]], [[['C4', 'D4']]], [[['B5', 'C5', 'D5']]], [], [], [[['B2', '&', 'C2', 'D2']]], [[['B3', '&', 'B4', 'C3', 'D3']]], [[['C4', 'D4']]], [[['B5', 'C5', 'D5']]]] # HTML pages elif file_name == 'html.txt': - assert file_text.tokens_multilevel == [[], [], [[['This', 'is', 'a', 'title']]], [], [], [[['Hello', 'world', '!']]], [], []] + assert tokens == [[], [], [[['This', 'is', 'a', 'title']]], [], [], [[['Hello', 'world', '!']]], [], []] # PDF files elif file_name == 'pdf.txt': - assert file_text.tokens_multilevel == [[[['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ','], ['consetetur', 'sadipscing', 
'elitr', ','], ['sed', 'diam', 'nonumy', 'eirmod']]], [[['tempor', 'invidunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliquyam', 'erat', ','], ['sed', 'diam', 'voluptua', '.']], [['At', 'vero']]], [[['eos', 'et', 'accusam', 'et', 'justo', 'duo', 'dolores', 'et', 'ea', 'rebum', '.']], [['Stet', 'clita', 'kasd', 'gubergren', ','], ['no', 'sea', 'taki-']]], [[['mata', 'sanctus', 'est', 'Lorem', 'ipsum', 'dolor', 'sit', 'amet', '.']], [['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ','], ['consetetur']]], [[['sadipscing', 'elitr', ','], ['sed', 'diam', 'nonumy', 'eirmod', 'tempor', 'invidunt', 'ut', 'labore', 'et', 'dolore', 'magna']]], [[['aliquyam', 'erat', ','], ['sed', 'diam', 'voluptua', '.']], [['At', 'vero', 'eos', 'et', 'accusam', 'et', 'justo', 'duo', 'dolores', 'et', 'ea']]], [[['rebum', '.']], [['Stet', 'clita', 'kasd', 'gubergren', ','], ['no', 'sea', 'takimata', 'sanctus', 'est', 'Lorem', 'ipsum', 'dolor', 'sit']]], [[['amet', '.']]], [[['1']]]] + assert tokens == [[[['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ','], ['consetetur', 'sadipscing', 'elitr', ','], ['sed', 'diam', 'nonumy', 'eirmod']]], [[['tempor', 'invidunt', 'ut', 'labore', 'et', 'dolore', 'magna', 'aliquyam', 'erat', ','], ['sed', 'diam', 'voluptua', '.']], [['At', 'vero']]], [[['eos', 'et', 'accusam', 'et', 'justo', 'duo', 'dolores', 'et', 'ea', 'rebum', '.']], [['Stet', 'clita', 'kasd', 'gubergren', ','], ['no', 'sea', 'taki-']]], [[['mata', 'sanctus', 'est', 'Lorem', 'ipsum', 'dolor', 'sit', 'amet', '.']], [['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ','], ['consetetur']]], [[['sadipscing', 'elitr', ','], ['sed', 'diam', 'nonumy', 'eirmod', 'tempor', 'invidunt', 'ut', 'labore', 'et', 'dolore', 'magna']]], [[['aliquyam', 'erat', ','], ['sed', 'diam', 'voluptua', '.']], [['At', 'vero', 'eos', 'et', 'accusam', 'et', 'justo', 'duo', 'dolores', 'et', 'ea']]], [[['rebum', '.']], [['Stet', 'clita', 'kasd', 'gubergren', ','], ['no', 'sea', 'takimata', 'sanctus', 'est', 'Lorem', 'ipsum', 'dolor', 'sit']]], [[['amet', '.']]], [[['1']]]] # Word documents elif file_name == 'docx.txt': - assert file_text.tokens_multilevel == [[], [[['Heading']]], [], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], [], [[['2', '-', '2/3']], [['2', '-', '4']]], [[['3/4', '-', '2', '3', '-', '3', '3', '-', '4']]], [[['4', '-', '3', '4', '-', '4', '4', '-', '4', '-', '1/2', '4', '-', '4', '-', '3/5', '4', '-', '4', '-', '4', '4', '-', '4', '-', '6']]], [], []] + assert tokens == [[], [[['Heading']]], [], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], [], [[['2-2/3', '2-4']]], [[['3/4-2', '3-3', '3-4']]], [[['4-3', '4-4', '4-4-1/2', '4-4-3/5', '4-4-4', '4-4-6']]], [], []] # XML files elif file_name == 'xml.xml': - assert file_text.tokens_multilevel == [[[['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?']]], [[['AIDS', '(', 'Acquired', 'Immune', 'Deficiency', 'Syndrome', ')', 'is', 'a', 'condition', 'caused', 'by', 'a', 'virus', 'called', 'HIV', '(', 'Human', 'Immuno', 'Deficiency', 'Virus', ')', '.']], [['This', 'virus', 'affects', 'the', 'body', "'s", 'defence', 'system', 'so', 'that', 'it', 'can', 'not', 'fight', 'infection', '.']]]] + assert tokens == [[[['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?']]], [[['AIDS', '(', 'Acquired', 'Immune', 'Deficiency', 'Syndrome', ')', 'is', 'a', 'condition', 'caused', 'by', 'a', 'virus', 
'called', 'HIV', '(', 'Human', 'Immuno', 'Deficiency', 'Virus', ')', '.']], [['This', 'virus', 'affects', 'the', 'body', "'s", 'defence', 'system', 'so', 'that', 'it', 'can', 'not', 'fight', 'infection', '.']]]] # XML tags unfound or unspecified elif file_name in ['xml (2).xml', 'xml (3).xml']: - assert file_text.tokens_multilevel == [[], [], [[['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?']]], [[['AIDS', '(', 'Acquired', 'Immune', 'Deficiency', 'Syndrome)is', 'a', 'condition', 'caused', 'by', 'a', 'virus', 'called', 'HIV', '(', 'Human', 'Immuno', 'Deficiency', 'Virus', ')', '.']]], [[['This', 'virus', 'affects', 'the', 'body', "'s", 'defence', 'system', 'so', 'that', 'it', 'can', 'not', 'fight', 'infection', '.']]]] + assert tokens == [[], [], [[['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?']]], [[['AIDS', '(', 'Acquired', 'Immune', 'Deficiency', 'Syndrome', ')', 'is', 'a', 'condition', 'caused', 'by', 'a', 'virus', 'called', 'HIV', '(', 'Human', 'Immuno', 'Deficiency', 'Virus', ')', '.']]], [[['This', 'virus', 'affects', 'the', 'body', "'s", 'defence', 'system', 'so', 'that', 'it', 'can', 'not', 'fight', 'infection', '.']]]] # Untokenized & Untagged elif file_name == 'xml (4).xml': - assert file_text.tokens_multilevel == [[[['<', 'bncDoc', 'xml', ':'], ['id="A00">']], [['[', 'ACET', 'factsheets', '&', 'amp', ';'], ['newsletters', ']', '.'], ['Sample', 'containing', 'about', '6688', 'words', 'of', 'miscellanea', '(', 'domain', ':'], ['social', 'science', ')']], [['<', '/title>', 'Data', 'capture', 'and', 'transcription', '<', '/resp>', 'Oxford', 'University', 'Press']], [['<']], [['/name']], [['>']], [['<', '/respStmt>', 'BNC', 'XML', 'Edition', ','], ['December', '2006']], [['6688', 'tokens', ';'], ['6708', 'w', '-', 'units', ';'], ['423', 's', '-', 'units']], [['<', '/extent>', 'Distributed', 'under', 'licence', 'by', 'Oxford', 'University', 'Computing', 'Services', 'on', 'behalf', 'of', 'the', 'BNC', 'Consortium.']], [['This', 'material', 'is', 'protected', 'by', 'international', 'copyright', 'laws', 'and', 'may', 'not', 'be', 'copied', 'or', 'redistributed', 'in', 'any', 'way', '.']], [['Consult', 'the', 'BNC', 'Web', 'Site', 'at', 'http://www.natcorp.ox.ac.uk', 'for', 'full', 'licencing', 'and', 'distribution', 'conditions.A00', 'AidFct', '<', '/idno>']], [['[', 'ACET', 'factsheets', '&', 'amp', ';'], ['newsletters', ']', '.']], [['<', '/title']], [['>']], [['<', 'imprint', 'n="AIDSCA1">']], [['Aids']], [['Care']], [['Education', '&', 'amp', ';'], ['Training']], [['<', '/publisher']], [['>']], [['<', 'pubPlace', '>', 'London']], [['<', '/pubPlace']], [['>']], [['<', 'date', 'value="1991', '-', '09', '"', '>', '1991', '-', '09']], [['<']], [['/date']], [['>']], [['<', '/imprint', '>', '<', '/bibl>1991', '-', '09', '<', '/creation>W', 'nonAc', ':'], ['medicine', 'Health', '<', '/term>', 'Sex', '<', '/term>Tag', 'usage', 'updated', 'for', 'BNC', '-', 'XMLLast', 'check', 'for', 'BNC', 'World', 'first', 'releaseRedo', 'tagusage', 'tablesCheck', 'all', 'tagcountsResequenced', 's', '-', 'units', 'and', 'added', 'headersAdded', 'date', 'infoUpdated', 'all', 'catrefsManually', 'updated', 'tagcounts', ','], ['titlestmt', ','], ['and', 'title', 'in', 'sourcePOS', 'codes', 'revised', 'for', 'BNC-2', ';'], ['header', 'updatedInitial', 'accession', 'to', 'corpus']]], [[['<', 'wtext', 'type="NONAC">']]], [[['<', 's', 'n="1">FACTSHEET', '<', '/w>WHAT', '<', '/w>IS', '<', '/w>AIDS?']]], [[['<', 's', 'n="2">AIDS']], [['<', '/w>(Acquired', '<', '/w>Immune', '<', '/w>Deficiency', '<', '/w>Syndrome)is', '<', '/w>a', 
'<', '/w>condition', '<', '/w>caused', '<', '/w>by', '<', '/w>a', '<', '/w>virus', '<', '/w>called', '<', '/w>HIV', '<', '/w>(Human', '<', '/w>Immuno', '<', '/w>Deficiency', '<', '/w>Virus).']]], [[['<', 's', 'n="3">This', '<', '/w>virus', '<', '/w>affects', '<', '/w>the', '<', '/w>body', "'s", '<', '/w>defence', '<', '/w>system', '<', '/w>so', '<', '/w>that', '<', '/w>it', '<', '/w>cannot', '<', '/w>fight', '<']], [['/w>infection.']]]] + assert tokens == [[[['<', 'bncDoc', 'xml', ':'], ['id=', "''", 'A00', "''", '>', '<', 'teiHeader', '>', '<', 'fileDesc', '>', '<', 'titleStmt', '>', '<', 'title', '>', '[', 'ACET', 'factsheets', '&', 'amp', ';'], ['newsletters', ']', '.']], [['Sample', 'containing', 'about', '6688', 'words', 'of', 'miscellanea', '(', 'domain', ':'], ['social', 'science', ')', '<', '/title', '>', '<', 'respStmt', '>', '<', 'resp', '>', 'Data', 'capture', 'and', 'transcription', '<', '/resp', '>', '<', 'name', '>', 'Oxford', 'University', 'Press', '<', '/name', '>', '<', '/respStmt', '>', '<', '/titleStmt', '>', '<', 'editionStmt', '>', '<', 'edition', '>', 'BNC', 'XML', 'Edition', ','], ['December', '2006', '<', '/edition', '>', '<', '/editionStmt', '>', '<', 'extent', '>', '6688', 'tokens', ';'], ['6708', 'w-units', ';'], ['423', 's-units', '<', '/extent', '>', '<', 'publicationStmt', '>', '<', 'distributor', '>', 'Distributed', 'under', 'licence', 'by', 'Oxford', 'University', 'Computing', 'Services', 'on', 'behalf', 'of', 'the', 'BNC', 'Consortium.'], ['<', '/distributor', '>', '<', 'availability', '>', 'This', 'material', 'is', 'protected', 'by', 'international', 'copyright', 'laws', 'and', 'may', 'not', 'be', 'copied', 'or', 'redistributed', 'in', 'any', 'way', '.']], [['Consult', 'the', 'BNC', 'Web', 'Site', 'at', 'http', ':'], ['//www.natcorp.ox.ac.uk', 'for', 'full', 'licencing', 'and', 'distribution', 'conditions.'], ['<', '/availability', '>', '<', 'idno', 'type=', "''", 'bnc', "''", '>', 'A00', '<', '/idno', '>', '<', 'idno', 'type=', "''", 'old', "''", '>', 'AidFct', '<', '/idno', '>', '<', '/publicationStmt', '>', '<', 'sourceDesc', '>', '<', 'bibl', '>', '<', 'title', '>', '[', 'ACET', 'factsheets', '&', 'amp', ';'], ['newsletters', ']', '.']], [['<', '/title', '>', '<', 'imprint', 'n=', "''", 'AIDSCA1', "''", '>', '<', 'publisher', '>', 'Aids', 'Care', 'Education', '&', 'amp', ';'], ['Training', '<', '/publisher', '>', '<', 'pubPlace', '>', 'London', '<', '/pubPlace', '>', '<', 'date', 'value=', "''", '1991-09', "''", '>', '1991-09', '<', '/date', '>', '<', '/imprint', '>', '<', '/bibl', '>', '<', '/sourceDesc', '>', '<', '/fileDesc', '>', '<', 'encodingDesc', '>', '<', 'tagsDecl', '>', '<', 'namespace', 'name=', "''", "''", '>', '<', 'tagUsage', 'gi=', "''", 'c', "''", 'occurs=', "''", '810', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'div', "''", 'occurs=', "''", '43', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'head', "''", 'occurs=', "''", '45', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'hi', "''", 'occurs=', "''", '24', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'item', "''", 'occurs=', "''", '43', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'label', "''", 'occurs=', "''", '10', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'list', "''", 'occurs=', "''", '8', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'mw', "''", 'occurs=', "''", '31', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'p', "''", 'occurs=', "''", '118', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'pb', "''", 'occurs=', "''", '2', "''", '/', '>', '<', 'tagUsage', 'gi=', 
"''", 's', "''", 'occurs=', "''", '423', "''", '/', '>', '<', 'tagUsage', 'gi=', "''", 'w', "''", 'occurs=', "''", '6708', "''", '/', '>', '<', '/namespace', '>', '<', '/tagsDecl', '>', '<', '/encodingDesc', '>', '<', 'profileDesc', '>', '<', 'creation', 'date=', "''", '1991', "''", '>', '1991-09', '<', '/creation', '>', '<', 'textClass', '>', '<', 'catRef', 'targets=', "''", 'WRI', 'ALLTIM3', 'ALLAVA2', 'ALLTYP5', 'WRIAAG0', 'WRIAD0', 'WRIASE0', 'WRIATY2', 'WRIAUD3', 'WRIDOM4', 'WRILEV2', 'WRIMED3', 'WRIPP5', 'WRISAM5', 'WRISTA2', 'WRITAS3', "''", '/', '>', '<', 'classCode', 'scheme=', "''", 'DLEE', "''", '>', 'W', 'nonAc', ':'], ['medicine', '<', '/classCode', '>', '<', 'keywords', '>', '<', 'term', '>', 'Health', '<', '/term', '>', '<', 'term', '>', 'Sex', '<', '/term', '>', '<', '/keywords', '>', '<', '/textClass', '>', '<', '/profileDesc', '>', '<', 'revisionDesc', '>', '<', 'change', 'date=', "''", '2006-10-21', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Tag', 'usage', 'updated', 'for', 'BNC-XML', '<', '/change', '>', '<', 'change', 'date=', "''", '2000-12-13', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Last', 'check', 'for', 'BNC', 'World', 'first', 'release', '<', '/change', '>', '<', 'change', 'date=', "''", '2000-09-06', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Redo', 'tagusage', 'tables', '<', '/change', '>', '<', 'change', 'date=', "''", '2000-09-01', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Check', 'all', 'tagcounts', '<', '/change', '>', '<', 'change', 'date=', "''", '2000-06-23', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Resequenced', 's-units', 'and', 'added', 'headers', '<', '/change', '>', '<', 'change', 'date=', "''", '2000-01-21', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Added', 'date', 'info', '<', '/change', '>', '<', 'change', 'date=', "''", '2000-01-09', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Updated', 'all', 'catrefs', '<', '/change', '>', '<', 'change', 'date=', "''", '2000-01-08', "''", 'who=', "''", '#', 'OUCS', "''", '>', 'Manually', 'updated', 'tagcounts', ','], ['titlestmt', ','], ['and', 'title', 'in', 'source', '<', '/change', '>', '<', 'change', 'date=', "''", '1999-09-13', "''", 'who=', "''", '#', 'UCREL', "''", '>', 'POS', 'codes', 'revised', 'for', 'BNC-2', ';'], ['header', 'updated', '<', '/change', '>', '<', 'change', 'date=', "''", '1994-11-24', "''", 'who=', "''", '#', 'dominic', "''", '>', 'Initial', 'accession', 'to', 'corpus', '<', '/change', '>', '<', '/revisionDesc', '>', '<', '/teiHeader', '>']]], [[['<', 'wtext', 'type=', "''", 'NONAC', "''", '>', '<', 'div', 'level=', "''", '1', "''", 'n=', "''", '1', "''", 'type=', "''", 'leaflet', "''", '>', '<', 'head', 'type=', "''", 'MAIN', "''", '>']]], [[['<', 's', 'n=', "''", '1', "''", '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'factsheet', "''", 'pos=', "''", 'SUBST', "''", '>', 'FACTSHEET', '<', '/w', '>', '<', 'w', 'c5=', "''", 'DTQ', "''", 'hw=', "''", 'what', "''", 'pos=', "''", 'PRON', "''", '>', 'WHAT', '<', '/w', '>', '<', 'w', 'c5=', "''", 'VBZ', "''", 'hw=', "''", 'be', "''", 'pos=', "''", 'VERB', "''", '>', 'IS', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'aids', "''", 'pos=', "''", 'SUBST', "''", '>', 'AIDS', '<', '/w', '>', '<', 'c', 'c5=', "''", 'PUN', "''", '>', '?'], ['<', '/c', '>', '<', '/s', '>', '<', '/head', '>', '<', 'p', '>']]], [[['<', 's', 'n=', "''", '2', "''", '>', '<', 'hi', 'rend=', "''", 'bo', "''", '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'aids', "''", 'pos=', "''", 'SUBST', "''", '>', 'AIDS', '<', 
'/w', '>', '<', 'c', 'c5=', "''", 'PUL', "''", '>', '(', '<', '/c', '>', '<', 'w', 'c5=', "''", 'VVN-AJ0', "''", 'hw=', "''", 'acquire', "''", 'pos=', "''", 'VERB', "''", '>', 'Acquired', '<', '/w', '>', '<', 'w', 'c5=', "''", 'AJ0', "''", 'hw=', "''", 'immune', "''", 'pos=', "''", 'ADJ', "''", '>', 'Immune', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'deficiency', "''", 'pos=', "''", 'SUBST', "''", '>', 'Deficiency', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'syndrome', "''", 'pos=', "''", 'SUBST', "''", '>', 'Syndrome', '<', '/w', '>', '<', 'c', 'c5=', "''", 'PUR', "''", '>', ')', '<', '/c', '>', '<', '/hi', '>', '<', 'w', 'c5=', "''", 'VBZ', "''", 'hw=', "''", 'be', "''", 'pos=', "''", 'VERB', "''", '>', 'is', '<', '/w', '>', '<', 'w', 'c5=', "''", 'AT0', "''", 'hw=', "''", 'a', "''", 'pos=', "''", 'ART', "''", '>', 'a', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'condition', "''", 'pos=', "''", 'SUBST', "''", '>', 'condition', '<', '/w', '>', '<', 'w', 'c5=', "''", 'VVN', "''", 'hw=', "''", 'cause', "''", 'pos=', "''", 'VERB', "''", '>', 'caused', '<', '/w', '>', '<', 'w', 'c5=', "''", 'PRP', "''", 'hw=', "''", 'by', "''", 'pos=', "''", 'PREP', "''", '>', 'by', '<', '/w', '>', '<', 'w', 'c5=', "''", 'AT0', "''", 'hw=', "''", 'a', "''", 'pos=', "''", 'ART', "''", '>', 'a', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'virus', "''", 'pos=', "''", 'SUBST', "''", '>', 'virus', '<', '/w', '>', '<', 'w', 'c5=', "''", 'VVN', "''", 'hw=', "''", 'call', "''", 'pos=', "''", 'VERB', "''", '>', 'called', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NP0', "''", 'hw=', "''", 'hiv', "''", 'pos=', "''", 'SUBST', "''", '>', 'HIV', '<', '/w', '>', '<', 'c', 'c5=', "''", 'PUL', "''", '>', '(', '<', '/c', '>', '<', 'w', 'c5=', "''", 'AJ0-NN1', "''", 'hw=', "''", 'human', "''", 'pos=', "''", 'ADJ', "''", '>', 'Human', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'immuno', "''", 'pos=', "''", 'SUBST', "''", '>', 'Immuno', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'deficiency', "''", 'pos=', "''", 'SUBST', "''", '>', 'Deficiency', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'virus', "''", 'pos=', "''", 'SUBST', "''", '>', 'Virus', '<', '/w', '>', '<', 'c', 'c5=', "''", 'PUR', "''", '>', ')', '<', '/c', '>', '<', 'c', 'c5=', "''", 'PUN', "''", '>', '.'], ['<', '/c', '>', '<', '/s', '>']]], [[['<', 's', 'n=', "''", '3', "''", '>', '<', 'w', 'c5=', "''", 'DT0', "''", 'hw=', "''", 'this', "''", 'pos=', "''", 'ADJ', "''", '>', 'This', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'virus', "''", 'pos=', "''", 'SUBST', "''", '>', 'virus', '<', '/w', '>', '<', 'w', 'c5=', "''", 'VVZ', "''", 'hw=', "''", 'affect', "''", 'pos=', "''", 'VERB', "''", '>', 'affects', '<', '/w', '>', '<', 'w', 'c5=', "''", 'AT0', "''", 'hw=', "''", 'the', "''", 'pos=', "''", 'ART', "''", '>', 'the', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'body', "''", 'pos=', "''", 'SUBST', "''", '>', 'body', '<', '/w', '>', '<', 'w', 'c5=', "''", 'POS', "''", 'hw=', "''", "'s", "''", 'pos=', "''", 'UNC', "''", '>', "'s", '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'defence', "''", 'pos=', "''", 'SUBST', "''", '>', 'defence', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'system', "''", 'pos=', "''", 'SUBST', "''", '>', 'system', '<', '/w', '>', '<', 'mw', 'c5=', "''", 'CJS', "''", '>', '<', 'w', 'c5=', "''", 'AV0', "''", 'hw=', 
"''", 'so', "''", 'pos=', "''", 'ADV', "''", '>', 'so', '<', '/w', '>', '<', 'w', 'c5=', "''", 'CJT', "''", 'hw=', "''", 'that', "''", 'pos=', "''", 'CONJ', "''", '>', 'that', '<', '/w', '>', '<', '/mw', '>', '<', 'w', 'c5=', "''", 'PNP', "''", 'hw=', "''", 'it', "''", 'pos=', "''", 'PRON', "''", '>', 'it', '<', '/w', '>', '<', 'w', 'c5=', "''", 'VM0', "''", 'hw=', "''", 'can', "''", 'pos=', "''", 'VERB', "''", '>', 'can', '<', '/w', '>', '<', 'w', 'c5=', "''", 'XX0', "''", 'hw=', "''", 'not', "''", 'pos=', "''", 'ADV', "''", '>', 'not', '<', '/w', '>', '<', 'w', 'c5=', "''", 'VVI', "''", 'hw=', "''", 'fight', "''", 'pos=', "''", 'VERB', "''", '>', 'fight', '<', '/w', '>', '<', 'w', 'c5=', "''", 'NN1', "''", 'hw=', "''", 'infection', "''", 'pos=', "''", 'SUBST', "''", '>', 'infection', '<', '/w', '>', '<', 'c', 'c5=', "''", 'PUN', "''", '>', '.'], ['<', '/c', '>', '<', '/s', '>', '<', '/p', '>']]]] # Tokenized & Untagged elif file_name == 'xml (5).xml': - assert file_text.tokens_multilevel == [[[['', '[ACET', 'factsheets', '&'], ['newsletters].']], [['Sample', 'containing', 'about', '6688', 'words', 'of', 'miscellanea', '(domain:'], ['social', 'science)', '', 'Data', 'capture', 'and', 'transcription', '', 'Oxford', 'University', 'Press', '', 'BNC', 'XML', 'Edition,'], ['December', '2006', '6688', 'tokens;'], ['6708', 'w-units;'], ['423', 's-units', 'Distributed', 'under', 'licence', 'by', 'Oxford', 'University', 'Computing', 'Services', 'on', 'behalf', 'of', 'the', 'BNC', 'Consortium.']], [['', 'This', 'material', 'is', 'protected', 'by', 'international', 'copyright', 'laws', 'and', 'may', 'not', 'be', 'copied', 'or', 'redistributed', 'in', 'any', 'way.']], [['Consult', 'the', 'BNC', 'Web', 'Site', 'at', 'http:'], ['//www.']], [['natcorp.']], [['ox.']], [['ac.']], [['uk', 'for', 'full', 'licencing', 'and', 'distribution', 'conditions.']], [['A00', 'AidFct', '', '[ACET', 'factsheets', '&'], ['newsletters].']], [['', '', 'Aids', 'Care', 'Education', '&'], ['Training', '', '', 'London', '', '', '1991-09', '', '', '1991-09', 'W', 'nonAc:'], ['medicine', 'Health', '', 'Sex', 'Tag', 'usage', 'updated', 'for', 'BNC-XMLLast', 'check', 'for', 'BNC', 'World', 'first', 'releaseRedo', 'tagusage', 'tablesCheck', 'all', 'tagcountsResequenced', 's-units', 'and', 'added', 'headersAdded', 'date', 'infoUpdated', 'all', 'catrefsManually', 'updated', 'tagcounts,'], ['titlestmt,'], ['and', 'title', 'in', 'sourcePOS', 'codes', 'revised', 'for', 'BNC-2;'], ['header', 'updatedInitial', 'accession', 'to', 'corpus']]], [[['']]], [[['FACTSHEET', 'WHAT', 'IS', 'AIDS?']], [['

']]], [[['AIDS', '(Acquired', 'Immune', 'Deficiency', 'Syndrome)is', 'a', 'condition', 'caused', 'by', 'a', 'virus', 'called', 'HIV', '(Human', 'Immuno', 'Deficiency', 'Virus).']], [['']]], [[['This', 'virus', 'affects', 'the', 'body\'s', 'defence', 'system', 'so', 'that', 'it', 'cannot', 'fight', 'infection.']], [['

']]]] + assert tokens == [[[['', '[ACET', 'factsheets', '&'], ['newsletters].']], [['Sample', 'containing', 'about', '6688', 'words', 'of', 'miscellanea', '(domain:'], ['social', 'science)', '', 'Data', 'capture', 'and', 'transcription', '', 'Oxford', 'University', 'Press', '', 'BNC', 'XML', 'Edition,'], ['December', '2006', '6688', 'tokens;'], ['6708', 'w-units;'], ['423', 's-units', 'Distributed', 'under', 'licence', 'by', 'Oxford', 'University', 'Computing', 'Services', 'on', 'behalf', 'of', 'the', 'BNC', 'Consortium.']], [['', 'This', 'material', 'is', 'protected', 'by', 'international', 'copyright', 'laws', 'and', 'may', 'not', 'be', 'copied', 'or', 'redistributed', 'in', 'any', 'way.']], [['Consult', 'the', 'BNC', 'Web', 'Site', 'at', 'http:'], ['//www.']], [['natcorp.']], [['ox.']], [['ac.']], [['uk', 'for', 'full', 'licencing', 'and', 'distribution', 'conditions.']], [['A00', 'AidFct', '', '[ACET', 'factsheets', '&'], ['newsletters].']], [['', '', 'Aids', 'Care', 'Education', '&'], ['Training', '', '', 'London', '', '', '1991-09', '', '', '1991-09', 'W', 'nonAc:'], ['medicine', 'Health', '', 'Sex', 'Tag', 'usage', 'updated', 'for', 'BNC-XMLLast', 'check', 'for', 'BNC', 'World', 'first', 'releaseRedo', 'tagusage', 'tablesCheck', 'all', 'tagcountsResequenced', 's-units', 'and', 'added', 'headersAdded', 'date', 'infoUpdated', 'all', 'catrefsManually', 'updated', 'tagcounts,'], ['titlestmt,'], ['and', 'title', 'in', 'sourcePOS', 'codes', 'revised', 'for', 'BNC-2;'], ['header', 'updatedInitial', 'accession', 'to', 'corpus']]], [[['']]], [[['FACTSHEET', 'WHAT', 'IS', 'AIDS?']], [['

']]], [[['AIDS', '(Acquired', 'Immune', 'Deficiency', 'Syndrome)is', 'a', 'condition', 'caused', 'by', 'a', 'virus', 'called', 'HIV', '(Human', 'Immuno', 'Deficiency', 'Virus).']], [['']]], [[['This', 'virus', 'affects', 'the', 'body\'s', 'defence', 'system', 'so', 'that', 'it', 'cannot', 'fight', 'infection.']], [['

']]]] - assert file_text.tags == [[] for _ in file_text.get_tokens_flat()] + assert tags == [None] * file_text.num_tokens # TMX files elif len(new_files) == 2: file_text_src = new_files[0]['text'] file_text_tgt = new_files[1]['text'] + tokens_src = file_text_src.to_token_texts() + tags_src = file_text_src.get_token_properties('tag') + # Source files print(file_text_src.lang) - print(file_text_src.tokens_multilevel) - print(file_text_src.tags) + print(tokens_src) assert file_text_src.lang == 'eng_us' - assert file_text_src.tokens_multilevel == [[[['Hello', 'world', '!']]]] + assert tokens_src == [[[['Hello', 'world', '!']]]] + assert tags_src == [None] * 3 # Target files + tokens_tgt = file_text_tgt.to_token_texts() + tags_tgt = file_text_tgt.get_token_properties('tag') + print(file_text_tgt.lang) - print(file_text_tgt.tokens_multilevel) - print(file_text_tgt.tags) + print(tokens_tgt) # Avoid loading the French model assert file_text_tgt.lang == 'eng_gb' - assert file_text_tgt.tokens_multilevel == [[[['Bonjour', 'tout', 'le', 'monde', '!']]]] + assert tokens_tgt == [[[['Bonjour', 'tout', 'le', 'monde', '!']]]] + assert tags_tgt == [None] * 5 def update_gui_unicode_decode_error(err_msg, new_files): assert not err_msg @@ -217,23 +225,26 @@ def update_gui_tags(err_msg, new_files): file_name = os.path.split(new_files[0]['path'])[1] file_text = new_files[0]['text'] - print(file_text.tokens_multilevel) - print(file_text.tags) + tokens = file_text.to_token_texts() + tags = file_text.get_token_properties('tag') + + print(tokens) + print(tags) if file_name == 'untokenized_untagged.txt': - assert file_text.tokens_multilevel == [[], [], [[['This', '<', 'TAG', '>', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] - assert file_text.tags == [[] for _ in file_text.get_tokens_flat()] + assert tokens == [[], [], [[['This', '<', 'TAG', '>', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] + assert tags == [None] * file_text.num_tokens elif file_name == 'untokenized_tagged.txt': - assert file_text.tokens_multilevel == [[[['']]], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] - assert file_text.tags == [[''], [''], [''], [], [], [], [], ['_TAG3'], [], [], [], [], [], [], [], [], [], [], ['', '']] + assert tokens == [[[['']]], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] + assert tags == ['', '', '', '', '', '', '', '_TAG3', '', '', '', '', '', '', '', '', '', '', ''] elif file_name == 'tokenized_untagged.txt': - assert file_text.tokens_multilevel == [[], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] - assert file_text.tags == [[] for _ in file_text.get_tokens_flat()] + assert tokens == [[], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] + assert tags == [None] * file_text.num_tokens elif file_name == 'tokenized_tagged.txt': - assert file_text.tokens_multilevel == 
[[[['']]], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] - assert file_text.tags == [[''], [''], [''], [], [], [], [], ['_TAG3RunningToken_TAG3'], [], [], [], [], [], [], [], [], [], [], ['', '']] + assert tokens == [[[['']]], [], [[['This', 'is', 'the', 'first', 'sentence', '.']], [['This', 'is', 'the', 'second', 'sentence', '.']]], [], [], [[['This', 'is', 'the', 'third', 'sentence', '.']]], [], []] + assert tags == ['', '', '', '', '', '', '', '_TAG3RunningToken_TAG3', '', '', '', '', '', '', '', '', '', '', ''] - assert len(file_text.get_tokens_flat()) == len(file_text.tags) + assert len(tags) == file_text.num_tokens def test_file_area_misc(): wl_test_init.clean_import_caches() diff --git a/tests/tests_nlp/test_dependency_parsing.py b/tests/tests_nlp/test_dependency_parsing.py index d745d6476..a29af6670 100644 --- a/tests/tests_nlp/test_dependency_parsing.py +++ b/tests/tests_nlp/test_dependency_parsing.py @@ -19,7 +19,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import wl_dependency_parsing, wl_word_tokenization +from wordless.wl_nlp import wl_dependency_parsing, wl_texts, wl_word_tokenization main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') @@ -32,69 +32,94 @@ @pytest.mark.parametrize('lang, dependency_parser', test_dependency_parsers) def test_dependency_parse(lang, dependency_parser): + test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') + + tokens = wl_word_tokenization.wl_word_tokenize_flat( + main, + text = test_sentence, + lang = lang + ) + + wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, tokens, '') + +def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, tokens, results): # Untokenized - dependencies = wl_dependency_parsing.wl_dependency_parse( + tokens_untokenized = wl_dependency_parsing.wl_dependency_parse( main, - inputs = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), + inputs = test_sentence, lang = lang, dependency_parser = dependency_parser ) + dependencies_untokenized = [ + (str(token), str(token.head), token.dependency_relation, token.dependency_len) + for token in tokens_untokenized + ] + + print(f'{lang} / {dependency_parser}:') + print(f'{dependencies_untokenized}\n') # Tokenized - tokens = wl_word_tokenization.wl_word_tokenize_flat( - main, - text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), - lang = lang - ) - dependencies_tokenized = wl_dependency_parsing.wl_dependency_parse( + tokens_tokenized = wl_dependency_parsing.wl_dependency_parse( main, inputs = tokens, lang = lang, dependency_parser = dependency_parser ) + dependencies_tokenized = [ + (str(token), str(token.head), token.dependency_relation, token.dependency_len) + for token in tokens_tokenized + ] - print(f'{lang} / {dependency_parser}:') - print(f'{dependencies}\n') + assert dependencies_untokenized == results # Check for empty dependencies - assert dependencies + assert dependencies_untokenized assert dependencies_tokenized - assert all(dependencies) + assert all(dependencies_untokenized) assert all(dependencies_tokenized) - for dependency in dependencies + dependencies_tokenized: + for dependency in dependencies_untokenized + dependencies_tokenized: assert len(dependency) == 4 # Tokenization should not be modified assert len(tokens) == len(dependencies_tokenized) - # Tagged texts + # Tagged 
main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - dependencies_tokenized_tagged = wl_dependency_parsing.wl_dependency_parse( + tokens_tagged = wl_dependency_parsing.wl_dependency_parse( main, - inputs = [token + '_TEST' for token in tokens], + inputs = [wl_texts.Wl_Token(token, tag = '_TEST') for token in tokens], lang = lang, - dependency_parser = dependency_parser, - tagged = True + dependency_parser = dependency_parser ) - - dependencies_tokenized = [ - (child + '_TEST', head + '_TEST', dependency_relation, dependency_dist) - for child, head, dependency_relation, dependency_dist in dependencies_tokenized + dependencies_tagged = [ + (str(token), str(token.head), token.dependency_relation, token.dependency_len) + for token in tokens_tagged ] - assert dependencies_tokenized_tagged == dependencies_tokenized + assert dependencies_tagged == dependencies_tokenized + + # Long + tokens_long = wl_dependency_parsing.wl_dependency_parse( + main, + inputs = wl_texts.to_tokens(wl_test_lang_examples.TOKENS_LONG, lang = lang), + lang = lang, + dependency_parser = dependency_parser + ) + + assert [str(token) for token in tokens_long] == wl_test_lang_examples.TOKENS_LONG - # Long texts - dependencies_tokenized_long = wl_dependency_parsing.wl_dependency_parse( + # Parsed + heads_orig = ['test_head'] + tokens_parsed = wl_dependency_parsing.wl_dependency_parse( main, - inputs = [str(i) for i in range(101) for j in range(10)], + inputs = wl_texts.to_tokens(['test'], lang = lang, heads = heads_orig), lang = lang, dependency_parser = dependency_parser ) - assert [dependency[0] for dependency in dependencies_tokenized_long] == [str(i) for i in range(101) for j in range(10)] + assert [str(token.head) for token in tokens_parsed] == heads_orig if __name__ == '__main__': for lang, dependency_parser in test_dependency_parsers: diff --git a/tests/tests_nlp/test_lemmatization.py b/tests/tests_nlp/test_lemmatization.py index 3a4a687d4..072efda6f 100644 --- a/tests/tests_nlp/test_lemmatization.py +++ b/tests/tests_nlp/test_lemmatization.py @@ -19,7 +19,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import wl_lemmatization, wl_word_tokenization +from wordless.wl_nlp import wl_lemmatization, wl_texts, wl_word_tokenization from wordless.wl_utils import wl_misc _, is_macos, _ = wl_misc.check_os() @@ -53,238 +53,277 @@ @pytest.mark.parametrize('lang, lemmatizer', test_lemmatizers) def test_lemmatize(lang, lemmatizer): + tests_lang_util_skipped = False + test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') + + tokens = wl_word_tokenization.wl_word_tokenize_flat( + main, + text = test_sentence, + lang = lang + ) + + match lang: + case 'sqi': + results = ['gjuhë', 'shqip', '(', 'ose', 'thjesht', 'shqipe', ')', 'jam', 'gjuhë', 'jap', 'degë', 'ai', 'veçantë', 'ai', 'familje', 'indo-evropiane', 'që', 'flitet', 'nga', 'rreth', '7-10', 'milionë', 'njeri', 'në', 'botë', ',', '[', '1', ']', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'jap', 'Maqedoninë', 'ai', 'veri', ',', 'por', 'edhe', 'në', 'zonë', 'ti', 'tjera', 'ti', 'Evropës', 'Juglindore', 'ku', 'kam', 'një', 'popullsi', 'shqiptar', ',', 'duk', 'përfshij', 'mal', 'ai', 'Zi', 'jap', 'luginë', 'ai', 'Preshevës', '.'] + case 'hye': + results = ['հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն։', 'գրաբար', 'է', 'ավանդված', 'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 
'աստվածաբանական-դավանաբանական', 'գրականությունը։'] + case 'ast': + results = ["L'asturianu", 'ser', 'un', 'llingua', 'romance', 'propiu', "d'Asturies", ',', '[', '1', ']', 'perteneciente', 'al', 'subgrupu', 'asturllionés', '.'] + case 'ben': + results = ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।'] + case 'bul': + results = ['бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от', 'група', 'на', 'южнославянскит', 'език', ',', 'като', 'образувам', 'негова', 'източен', 'подгрупа', '.'] + case 'cat': + results = ['ell', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'el', 'illa', 'balear', ',', 'a', 'Andorra', ',', 'a', 'el', 'ciutat', 'de', "l'Alguer", 'i', 'tradicional', 'a', 'Catalunya', 'del', 'nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'pair', 'valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a', 'Catalunya', ',', 'ell', 'pair', 'valencià', '(', 'treure', "d'algunes", 'comarca', 'i', 'localitat', 'de', "l'interior", ')', ',', 'el', 'illa', 'balear', '(', 'on', 'també', 'rebre', 'ell', 'nòmer', 'de', 'mallorquí', ',', 'menorquí', ',', 'eivissenc', 'o', 'formenterer', 'segon', "l'illa", ')', ',', 'Andorra', ',', 'el', 'franjar', 'de', 'pondre', '(', 'a', "l'Aragó", ')', ',', 'el', 'ciutat', 'de', "l'Alguer", '(', 'a', "l'illa", 'de', 'Sardenya', ')', ',', 'el', 'Catalunya', 'del', 'nord', ',', '[', '8', ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'habitar', 'per', 'poblador', 'valencià', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'comunitat', 'arreu', 'del', 'món', '(', 'entrar', 'el', 'qual', 'destacar', 'el', 'de', "l'Argentina", ',', 'amb', '200.000', 'parlant', ')', '.', '[', '11', ']'] + case 'hrv': + results = ['hrvatski', 'jezik', '(', 'ISO', '639-3', ':', 'hrv', ')', 'skupni', 'ju', 'naziv', 'за', 'nacionalni', 'standardni', 'jezik', 'Hrvat', ',', 'ti', 'за', 'skup', 'narječje', 'i', 'govora', 'kojima', 'govoriti', 'ili', 'biti', 'nekada', 'govoriti', 'Hrvat', '.'] + case 'ces': + match lemmatizer: + case 'simplemma_ces': + results = ['čeština', 'neboli', 'český', 'jazyk', 'být', 'západoslovanský', 'jazyk', ',', 'nejbližší', 'slovenština', ',', 'poté', 'lužické', 'srbštině', 'a', 'polština', '.'] + case 'spacy_ces': + results = ['Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský', 'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.'] + case _: + tests_lang_util_skipped = True + case 'dan': + results = ['dansk', 'være', 'en', 'østnordisk', 'sprog', 'indenfor', 'den', 'germansk', 'gren', 'af', 'den', 'indoeuropæiske', 'sprogfamilie', '.'] + case 'nld': + results = ['het', 'Nederlands', 'zijn', 'een', 'west-germaans', 'talen', ',', 'de', 'veel', 'gebruiken', 'talen', 'in', 'Nederland', 'en', 'België', ',', 'de', 'officieel', 'talen', 'van', 'Suriname', 'en', 'een', 'van', 'de', 'drie', 'officieel', 'tale', 'van', 'België', '.'] + case 'enm': + results = ['Forrþrihht', 'anan', 'see', 'timen', 'comm', 'þatt', 'eure', 'Drihhtin', 'wollde', 'been', 'borenn', 'in', 'þiss', 'middellærd', 'forr', 'all', 'mannkinne', 'neden', 'hem', 'chæs', 'him', 'sonne', 'kinnessmenn', 'all', 'swillke', 'summ', 'hem', 'wollde', 'and', 'whær', 'hem', 'wollde', 'borenn', 'been', 'hem', 'chæs', 'all', 'att', 'his', 'willen', '.'] + case 'eng_gb' 
| 'eng_us': + match lemmatizer: + case 'nltk_wordnet': + results = ['English', 'be', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] + case 'simplemma_eng': + results = ['English', 'be', 'a', 'west', 'germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] + case _: + tests_lang_util_skipped = True + case 'est': + results = ['Eesti', 'keel', 'olema', 'kaks', 'suurem', 'murd', '(', 'põhi', 'ja', 'lõuna', ')', ',', 'mõni', 'käsitlus', 'eristama', 'ka', 'kirderannik', 'murre', 'eraldi', 'murderühmana', '.'] + case 'fin': + results = ['Suomi', 'kieli', 'eli', 'suomi', 'olla', 'uralilainen', 'kieli', 'itämerensuomalainen', 'ryhmä', 'kuuluva', 'kieli', ',', 'jota', 'puhua', 'pääosa', 'Suomalainen', '.'] + case 'fra': + results = ['le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', 'dont', 'le', 'locuteurs', 'être', 'appelé', 'francophone', '.'] + case 'glg': + results = ['O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', '[', '1', ']', ')', 'ser', 'un', 'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de', 'lingua', 'románico', '.'] + case 'kat': + results = ['ქართული', 'ენა', '—', 'იბერიულ-კავკასიურ', 'ენათა', 'ოჯახის', 'ქართველურ', 'ენათა', 'ჯგუფი', 'ენა', '.'] + case 'deu_at' | 'deu_de' | 'deu_ch': + results = ['der', 'Deutscher', 'sein', 'ein', 'plurizentrische', 'Sprache', ',', 'enthalten', 'also', 'mehrere', 'Standardvarietät', 'in', 'verschieden', 'Region', '.'] + case 'grc': + results = ['ἔρχομαι', 'δέ', 'ὁ', 'δύο', 'ἄγγελος', 'εἰς', 'Σόδομα', 'ἑσπέρα', '·', 'Λὼτ', 'δέ', 'κάθημαι', 'παρά', 'ὁ', 'πύλη', 'Σοδόμων', '.', 'εἶδον', 'δέ', 'Λὼτ', 'ἐξανίστημι', 'εἰς', 'συνάντησιν', 'αὐτός', 'καί', 'προσκυνέω', 'ὁ', 'πρόσωπον', 'ἐπί', 'ὁ', 'γῆ'] + case 'ell': + results = ['ο', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια', '[', '9', ']', 'και', 'αποτελώ', 'ο', 'μοναδικός', 'μέλος', 'ο', 'ελληνικός', 'κλάδος', ',', 'ενώ', 'είμαι', 'ο', 'επίσημος', 'γλώσσα', 'ο', 'Ελλάδα', 'και', 'ο', 'Κύπρος', '.'] + case 'hin': + results = ['हिंदी', 'जिसके', 'मानकीकृत', 'रूप', 'को', 'मानक', 'हिंदी', 'कहना', 'जाना', 'होना', ',', 'विश्व', 'का', 'एक', 'प्रमुख', 'भाषा', 'होना', 'और', 'भारत', 'का', 'एक', 'राजभाषा', 'है।'] + case 'hun': + match lemmatizer: + case 'simplemma_hun': + results = ['a', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelve', 'köz', 'tartozik', 'ugor', 'nyelve', 'egyik', '.'] + case 'spacy_hun': + results = ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv', 'egyik', '.'] + case _: + tests_lang_util_skipped = True + case 'isl': + results = ['íslenskur', 'vera', 'vesturnorrænt', ',', 'germanskur', 'og', 'indóevrópskur', 'tungumál', 'semja', 'vera', 'einkum', 'tala', 'og', 'rita', 'ær', 'Ísland', 'og', 'vera', 'móðurmál', 'langflestra', 'Íslendinga.', '[', '5', ']'] + case 'ind': + match lemmatizer: + case 'simplemma_ind': + results = ['bahasa', 'Indonesia', 'adalah', 'bahasa', 'nasional', 'dan', 'resmi', 'di', 'seluruh', 'wilayah', 'Indonesia', '.'] + case 'spacy_ind': + results = ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'nasional', 'dan', 'resmi', 'di', 'seluruh', 'wilayah', 'Indonesia', '.'] + case _: + tests_lang_util_skipped = True + case 'gle': + match lemmatizer: + case 'simplemma_gle': + results = ['Is', 'ceann', 'de', 'na', 'teangach', 'ceilteach', 'í', 'an', 'gaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 
'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'de', 'na', 'trí', 'ceann', 'de', 'teangach', 'ceilteach', 'ar', 'a', 'tabhair', 'na', 'teangach', 'gaelach', '(', 'Gaeilge', ',', 'Gaeilge', 'manainn', 'agus', 'Gaeilge', 'na', 'hAlban', ')', 'go', 'áirithe', '.'] + case 'spacy_gle': + results = ['is', 'ceann', 'de', 'na', 'teangacha', 'ceilteacha', 'í', 'an', 'ghaeilge', '(', 'nó', 'gaeilge', 'na', 'héireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'de', 'na', 'trí', 'cinn', 'de', 'theangacha', 'ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'gaelacha', '(', 'gaeilge', ',', 'gaeilge', 'mhanann', 'agus', 'gaeilge', 'na', 'halban', ')', 'go', 'háirithe', '.'] + case _: + tests_lang_util_skipped = True + case 'ita': + results = ["L'italiano", '(', '[', 'itaˈljaːno', ']', '[', 'nota', '1', ']', 'ascoltaⓘ', ')', 'essere', 'uno', 'lingua', 'romanza', 'parlato', 'principalmente', 'in', 'Italia', '.'] + case 'jpn': + results = ['日本語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注釈', '2', ']', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だ', 'た', '国', '、', 'そして', '国外', '移民', 'や', '移住者', 'を', '含む', '日本人', '同士', 'の', '間', 'で', '使用', 'する', 'れる', 'て', 'いる', '言語', '。'] + case 'kor': + results = ['한국어', '(', '韓國語', ')', '는', '대한민+국과', '조선민주주의인민공화국+의', '공용어이다', '.'] + case 'lat': + results = ['lingua', 'Latinus', ',', '[', '1', ']', 'sive', 'sermo', 'Latinus', ',', '[', '2', ']', 'sum', 'lingua', 'indoeuropaeus', 'qui', 'primus', 'Latinus', 'universus', 'et', 'Romanus', 'antiquus', 'in', 'primus', 'loquor', 'quamobrem', 'interdum', 'etiam', 'lingua', 'Latius', '[', '3', ']', '(', 'in', 'Latium', 'enim', 'suetus', ')', 'et', 'lingua', 'Romanus', '[', '4', ']', '(', 'nam', 'imperium', 'Romanus', 'sermo', 'sollemne', ')', 'appello', '.'] + case 'lav': + results = ['latviete', 'valoda', 'būt', 'dzimta', 'valoda', 'apmērs', '1,5', 'miljons', 'cilvēks', ',', 'galvenokārt', 'Latvija', ',', 'kur', 'tā', 'būt', 'vienīgs', 'valsts', 'valoda.', '[', '1', ']', '[', '3', ']'] + case 'lit': + results = ['lietuvė', 'kalba', '–', 'ižti', 'baltas', 'prokalbė', 'kilęs', 'lietuvė', 'tauta', 'kalba', ',', 'kurti', 'Lietuva', 'irti', 'valstybinis', ',', 'o', 'Europa', 'sąjunga', '–', 'Viena', 'ižti', 'oficialus', 'kalbus', '.'] + case 'ltz': + match lemmatizer: + case 'simplemma_ltz': + results = ["D'Lëtzebuergesch", 'ginn', 'an', 'der', 'däitsch', 'Dialektologie', 'als', 'een', 'westgermanesch', ',', 'mëtteldäitsch', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'muselfränkesch', 'gehéiert', '.'] + case 'spacy_ltz': + results = ["D'", 'Lëtzebuergesch', 'ginn', 'an', 'der', 'däitsch', 'Dialektologie', 'als', 'een', 'westgermanesch', ',', 'mëtteldäitsch', 'Dialekt', 'aklasséieren', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéieren', '.'] + case _: + tests_lang_util_skipped = True + case 'mkd': + results = ['македонски', 'јазик', '—', 'јужнословенски', 'јазик', ',', 'дел', 'од', 'група', 'на', 'словенски', 'јазик', 'од', 'јазичното', 'семејство', 'на', 'индоевропски', 'јазик', '.'] + case 'msa': + results = ['bahasa', 'Melayu', '(', 'tulisan', 'Jawi', ':', 'bahasa', 'Melayu', ';', 'rencong', ':', 'ꤷꥁꤼ', 'ꤸꥍꤾꤿꥈ', ')', 'ialah', 'salah', 'ساتو', 'daripada', 'bahasa', 'Melayu-Polinesia', 'di', 'bawah', 'keluarga', 'bahasa', 'Austronesia', ',', 'hiang', 'merupakan', 'bahasa', 'rasmi', 'di', 'Brunei', ',', 'Indonesia', ',', 'Malaysia', 'دان', 'Singapura', ',', 'serta', 'dituturkan', 'di', 'timur', 'Leste', 'دان', 'sebahagian', 'wilayah', 'di', 'Kemboja', ',', 'Filipina', 
'دان', 'Thailand', '.'] + case 'glv': + results = ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.'] + case 'nob': + results = ['bokmål', 'være', 'enn', 'av', 'to', 'offisiell', 'målform', 'av', 'norsk', 'skriftspråk', ',', 'hvorav', 'den', 'annen', 'være', 'nynorsk', '.'] + case 'nno': + results = ['nynorsk', ',', 'føra', '1929', 'offisiell', 'kall', 'landsmål', ',', 'vera', 'sidan', 'jamstillingsvedtaket', 'av', '12', '.', 'mai', '1885', 'ein', 'av', 'den', 'to', 'offisiell', 'målformene', 'av', 'norsk', ';', 'den', 'annan', 'forme', 'vera', 'bokmål', '.'] + case 'fas': + match lemmatizer: + case 'simplemma_fas': + results = ['فارسی', 'یا', 'پارسی', 'یک', 'زبان', 'ایرانی', 'غربی', 'از', 'زیرگروه', 'ایرانی', 'شاخهٔ', 'هندوایرانیِ', 'خانوادهٔ', 'زبان\u200cهای', 'هندواروپایی', 'است', 'که', 'در', 'کشورهای', 'ایران،', 'افغانستان،', 'تاجیکستان،', 'ازبکستان،', 'پاکستان،', 'عراق،', 'ترکمنستان', 'را', 'آذربایجان', 'به', 'آن', 'سخن', 'می\u200cگویند', '.'] + case 'spacy_fas': + results = ['فارسی', 'یا', 'پارسی', 'یک', 'زبان', 'ایرانی', 'غربی', 'از', 'زیرگروه', 'ایرانی', 'شاخهٔ', 'هندوایرانیِ', 'خانوادهٔ', 'زبان\u200cهای', 'هندواروپایی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', 'تاجیکستان', '،', 'ازبکستان', '،', 'پاکستان', '،', 'عراق', '،', 'ترکمنستان', 'و', 'آذربایجان', 'به', 'آن', 'سخن', 'می\u200cگویند', '.'] + case _: + tests_lang_util_skipped = True + case 'pol': + results = ['język', 'polski', ',', 'polszczyzna', '–', 'język', 'z', 'grupa', 'zachodniosłowiański', '(', 'do', 'który', 'należeć', 'również', 'czeski', ',', 'kaszubski', ',', 'słowacki', 'i', 'język', 'łużycki', ')', ',', 'stanowić', 'część', 'rodzina', 'indoeuropejski', '.'] + case 'por_br' | 'por_pt': + results = ['o', 'língua', 'portuguesar', ',', 'também', 'designado', 'português', ',', 'ser', 'umar', 'língua', 'indo-europeu', 'românico', 'flexivo', 'ocidental', 'originado', 'o', 'galego-português', 'falar', 'o', 'reino', 'da', 'galiza', 'e', 'o', 'norte', 'de', 'portugal', '.'] + case 'ron': + results = ['limbă', 'român', 'fi', 'el', 'limbă', 'indo-european', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.'] + case 'rus': + match lemmatizer: + case 'simplemma_rus': + results = ['Ру́сский', 'язы́к', '(', 'МФА', ':', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянский', 'группа', 'славянский', 'ветвь', 'индоевропейский', 'языковый', 'семья', ',', 'национальный', 'язык', 'русский', 'народ', '.'] + case 'pymorphy3_morphological_analyzer': + results = ['ру́сский', 'язы́к', '(', 'мфа', ':', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянский', 'группа', 'славянский', 'ветвь', 'индоевропейский', 'языковой', 'семья', ',', 'национальный', 'язык', 'русский', 'народ', '.'] + case _: + tests_lang_util_skipped = True + case 'sme': + results = ['davvisámegiella', 'gullát', 'sámegiella', 'oarjesámegielaid', 'davvejovkui', 'ovttastit', 'julev-', 'ja', 'bihtánsámegielain', '.'] + case 'gla': + results = ["'S", 'i', 'cànan', 'dùthchasach', 'na', 'h-alba', 'a', 'th', "'", 'anns', 'a', "'", 'gàidhlig', '.'] + case 'srp_cyrl': + results = ['Српски', 'језик', 'бити', 'званичан', 'у', 'Србији', ',', 'Босни', 'и', 'Херцеговини', 'и', 'Црној', 'Гори', 'и', 'говорити', 'он', 'око', '12', 'милион', 'људи.[13', ']'] + case 'srp_latn': + results = ['srpski', 'jezik', 'ju', 'zvaničan', 'u', 'Srbija', ',', 'Bosna', 'i', 'Hercegovina', 'i', 
'crn', 'gora', 'i', 'govoriti', 'ih', 'oko', '12', 'milion', 'ljudi.', '[', '13', ']'] + case 'slk': + results = ['slovenčina', 'byť', 'oficiálne', 'úradný', 'jazyk', 'Slovensko', ',', 'vojvodiny', 'a', 'od', '1', '.', 'máj', '2004', 'jeden', 'z', 'jazyk', 'európsky', 'únia', '.'] + case 'slv': + results = ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'on', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govorilo', 'Slovenec', '.'] + case 'spa': + results = ['el', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', ',', 'perteneciente', 'a', 'el', 'familia', 'de', 'lengua', 'indoeuropeo', '.'] + case 'swa': + results = ['Kiswahili', 'ni', 'lugha', 'ya', 'Kibantu', 'enye', 'msamiati', 'ingi', 'ya', 'Kiarabu', '(', '35', '%', '[', '1', ']', ')', ',', 'laki', 'sasa', 'ya', 'Kiingereza', 'pia', '(', '10', '%', ')', ',', 'inayozungumzwa', 'katika', 'eneo', 'kubwa', 'la', 'Afrika', 'ya', 'mashariki', '.'] + case 'swe': + results = ['svensk', '(', 'svensk', '(', 'info', ')', ')', 'ära', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'ha', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'den', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.'] + case 'tgl': + match lemmatizer: + case 'simplemma_tgl': + results = ['Ang', 'wikang', 'Tagalog', '[', '1', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜆᜄᜎᜓ', ')', ',', 'o', 'ang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pinakaginagamit', 'na', 'wikain', 'ng', 'Pilipinas', '.'] + case 'spacy_tgl': + results = ['Ang', 'wikang', 'Tagalog[1', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜆᜄᜎᜓ', ')', ',', 'o', 'ang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pinakaginagamit', 'na', 'wika', 'ng', 'Pilipinas', '.'] + case _: + tests_lang_util_skipped = True + case 'bod': + results = ['བོད་', 'གི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'ཉེ་འཁོར་', 'གི་', 'ས་ཁུལ་', 'བལ་ཡུལ་', '།', 'འབྲུག་', 'དང་', 'འབྲས་ལྗོངས་', '།'] + case 'tur': + match lemmatizer: + case 'simplemma_tur': + results = ['türkçe', 'ya', 'da', 'Türk', 'dil', ',', 'güneydoğu', 'avrupa', 've', 'batı', 'asya', 'konuş', ',', 'Türk', 'dil', 'dil', 'aile', 'ait', 'son', 'ekle', 'bir', 'dil.', '[', '12', ']'] + case 'spacy_tur': + results = ['Türkçe', 'ya', 'da', 'Türk', 'dil', ',', 'Güneydoğu', 'Avrupa', 've', 'Batı', "Asya'da", 'konuş', ',', 'Türk', 'dil', 'dil', 'aile', 'ait', 'son', 'ekle', 'bir', 'dil.[12', ']'] + case _: + tests_lang_util_skipped = True + case 'ukr': + match lemmatizer: + case 'pymorphy3_morphological_analyzer': + results = ['украї́нський', 'мо́вий', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'ру́ський', '[', '10', ']', '[', '11', ']', '[', '12', ']', '[', '*', '1', ']', ')', '—', 'національний', 'мова', 'українець', '.'] + case 'simplemma_ukr': + results = ['Українськ', 'мо́ва', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'руський', '[', '10', ']', '[', '11', ']', '[', '12', ']', '[', '*', '1', ']', ')', '—', 'національний', 'мова', 'українець', '.'] + case _: + tests_lang_util_skipped = True + case 'urd': + results = ['اُردُو[8', ']', 'برصغیر', 'کم', 'معیاری', 'زبان', 'میں', 'سے', 'ایک', 'ہونا', '۔'] + case 'cym': + results = ['yn', 'cyfrifiad', 'yr', 'tu', '(', 
'2011', ')', ',', 'darganfod', 'bodio', '19', '%', '(', '562,000', ')', 'prpers', 'preswylwr', 'cymru', '(', 'tair', 'blwydd', 'a', 'trosodd', ')', 'bod', 'gallu', 'siarad', 'cymraeg', '.'] + case _: + raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + + if tests_lang_util_skipped: + raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(lemmatizer) + + wl_test_lemmatize_models(lang, lemmatizer, test_sentence, tokens, results) + +def wl_test_lemmatize_models(lang, lemmatizer, test_sentence, tokens, results, lang_exceptions = None): + lang_exceptions = lang_exceptions or [] + # Untokenized - lemmas = wl_lemmatization.wl_lemmatize( + tokens_untokenized = wl_lemmatization.wl_lemmatize( main, - inputs = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), + inputs = test_sentence, lang = lang, lemmatizer = lemmatizer ) + lemmas_untokenized = [token.lemma for token in tokens_untokenized] + + print(f'{lang} / {lemmatizer}:') + print(f'{lemmas_untokenized}\n') # Tokenized - tokens = wl_word_tokenization.wl_word_tokenize_flat( - main, - text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), - lang = lang - ) - lemmas_tokenized = wl_lemmatization.wl_lemmatize( + tokens_tokenized = wl_lemmatization.wl_lemmatize( main, inputs = tokens, lang = lang, lemmatizer = lemmatizer ) + lemmas_tokenized = [token.lemma for token in tokens_tokenized] - print(f'{lang} / {lemmatizer}:') - print(f'{lemmas}\n') + assert lemmas_untokenized == results # Check for empty lemmas - assert lemmas + assert lemmas_untokenized assert lemmas_tokenized - assert all(lemmas) + assert all(lemmas_untokenized) assert all(lemmas_tokenized) # Tokenization should not be modified assert len(tokens) == len(lemmas_tokenized) - # Tagged texts + # Tagged main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - lemmas_tokenized_tagged = wl_lemmatization.wl_lemmatize( + tokens_tagged = wl_lemmatization.wl_lemmatize( main, - inputs = [token + '_TEST' for token in tokens], + inputs = [wl_texts.Wl_Token(token, tag = '_TEST') for token in tokens], lang = lang, - lemmatizer = lemmatizer, - tagged = True + lemmatizer = lemmatizer ) + lemmas_tagged = [token.lemma for token in tokens_tagged] - assert lemmas_tokenized_tagged == [lemma + '_TEST' for lemma in lemmas_tokenized] + assert lemmas_tagged == lemmas_tokenized - # Long texts - lemmas_tokenized_long = wl_lemmatization.wl_lemmatize( + # Long + tokens_long = wl_lemmatization.wl_lemmatize( main, - inputs = [str(i) for i in range(101) for j in range(10)], + inputs = wl_texts.to_tokens(wl_test_lang_examples.TOKENS_LONG, lang = lang), lang = lang, lemmatizer = lemmatizer ) + lemmas_long = [token.lemma for token in tokens_long] - assert lemmas_tokenized_long == [str(i) for i in range(101) for j in range(10)] - - tests_lang_util_skipped = False - - if lang == 'sqi': - assert lemmas == ['gjuhë', 'shqip', '(', 'ose', 'thjesht', 'shqipe', ')', 'jam', 'gjuhë', 'jap', 'degë', 'ai', 'veçantë', 'ai', 'familje', 'indo-evropiane', 'që', 'flitet', 'nga', 'rreth', '7-10', 'milionë', 'njeri', 'në', 'botë', ',', '[', '1', ']', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'jap', 'Maqedoninë', 'ai', 'veri', ',', 'por', 'edhe', 'në', 'zonë', 'ti', 'tjera', 'ti', 'Evropës', 'Juglindore', 'ku', 'kam', 'një', 'popullsi', 'shqiptar', ',', 'duk', 'përfshij', 'mal', 'ai', 'Zi', 'jap', 'luginë', 'ai', 'Preshevës', '.'] - elif lang == 'hye': - assert lemmas == ['հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն։', 'գրաբար', 'է', 'ավանդված', 
'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 'աստվածաբանական-դավանաբանական', 'գրականությունը։'] - elif lang == 'ast': - assert lemmas == ["L'asturianu", 'ser', 'un', 'llingua', 'romance', 'propiu', "d'Asturies", ',', '[', '1', ']', 'perteneciente', 'al', 'subgrupu', 'asturllionés', '.'] - elif lang == 'ben': - assert lemmas == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।'] - elif lang == 'bul': - assert lemmas == ['бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от', 'група', 'на', 'южнославянскит', 'език', ',', 'като', 'образувам', 'негова', 'източен', 'подгрупа', '.'] - elif lang == 'cat': - assert lemmas == ['ell', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'el', 'illa', 'balear', ',', 'a', 'Andorra', ',', 'a', 'el', 'ciutat', 'de', "l'Alguer", 'i', 'tradicional', 'a', 'Catalunya', 'del', 'nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'pair', 'valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a', 'Catalunya', ',', 'ell', 'pair', 'valencià', '(', 'treure', "d'algunes", 'comarca', 'i', 'localitat', 'de', "l'interior", ')', ',', 'el', 'illa', 'balear', '(', 'on', 'també', 'rebre', 'ell', 'nòmer', 'de', 'mallorquí', ',', 'menorquí', ',', 'eivissenc', 'o', 'formenterer', 'segon', "l'illa", ')', ',', 'Andorra', ',', 'el', 'franjar', 'de', 'pondre', '(', 'a', "l'Aragó", ')', ',', 'el', 'ciutat', 'de', "l'Alguer", '(', 'a', "l'illa", 'de', 'Sardenya', ')', ',', 'el', 'Catalunya', 'del', 'nord', ',', '[', '8', ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'habitar', 'per', 'poblador', 'valencià', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'comunitat', 'arreu', 'del', 'món', '(', 'entrar', 'el', 'qual', 'destacar', 'el', 'de', "l'Argentina", ',', 'amb', '200.000', 'parlant', ')', '.', '[', '11', ']'] - elif lang == 'hrv': - assert lemmas == ['hrvatski', 'jezik', '(', 'ISO', '639-3', ':', 'hrv', ')', 'skupni', 'ju', 'naziv', 'за', 'nacionalni', 'standardni', 'jezik', 'Hrvat', ',', 'ti', 'за', 'skup', 'narječje', 'i', 'govora', 'kojima', 'govoriti', 'ili', 'biti', 'nekada', 'govoriti', 'Hrvat', '.'] - elif lang == 'ces': - if lemmatizer == 'simplemma_ces': - assert lemmas == ['čeština', 'neboli', 'český', 'jazyk', 'být', 'západoslovanský', 'jazyk', ',', 'nejbližší', 'slovenština', ',', 'poté', 'lužické', 'srbštině', 'a', 'polština', '.'] - elif lemmatizer == 'spacy_ces': - assert lemmas == ['Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský', 'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'dan': - assert lemmas == ['dansk', 'være', 'en', 'østnordisk', 'sprog', 'indenfor', 'den', 'germansk', 'gren', 'af', 'den', 'indoeuropæiske', 'sprogfamilie', '.'] - elif lang == 'nld': - assert lemmas == ['het', 'Nederlands', 'zijn', 'een', 'west-germaans', 'talen', ',', 'de', 'veel', 'gebruiken', 'talen', 'in', 'Nederland', 'en', 'België', ',', 'de', 'officieel', 'talen', 'van', 'Suriname', 'en', 'een', 'van', 'de', 'drie', 'officieel', 'tale', 'van', 'België', '.'] - elif lang == 'enm': - assert lemmas == ['Forrþrihht', 'anan', 'see', 'timen', 'comm', 'þatt', 'eure', 'Drihhtin', 'wollde', 'been', 'borenn', 'in', 'þiss', 
'middellærd', 'forr', 'all', 'mannkinne', 'neden', 'hem', 'chæs', 'him', 'sonne', 'kinnessmenn', 'all', 'swillke', 'summ', 'hem', 'wollde', 'and', 'whær', 'hem', 'wollde', 'borenn', 'been', 'hem', 'chæs', 'all', 'att', 'his', 'willen', '.'] - elif lang.startswith('eng_'): - if lemmatizer == 'nltk_wordnet': - assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] - elif lemmatizer == 'simplemma_eng': - assert lemmas == ['English', 'be', 'a', 'west', 'germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'est': - assert lemmas == ['Eesti', 'keel', 'olema', 'kaks', 'suurem', 'murd', '(', 'põhi', 'ja', 'lõuna', ')', ',', 'mõni', 'käsitlus', 'eristama', 'ka', 'kirderannik', 'murre', 'eraldi', 'murderühmana', '.'] - elif lang == 'fin': - assert lemmas == ['Suomi', 'kieli', 'eli', 'suomi', 'olla', 'uralilainen', 'kieli', 'itämerensuomalainen', 'ryhmä', 'kuuluva', 'kieli', ',', 'jota', 'puhua', 'pääosa', 'Suomalainen', '.'] - elif lang == 'fra': - assert lemmas == ['le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', 'dont', 'le', 'locuteurs', 'être', 'appelé', 'francophone', '.'] - elif lang == 'glg': - assert lemmas == ['O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', '[', '1', ']', ')', 'ser', 'un', 'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de', 'lingua', 'románico', '.'] - elif lang == 'kat': - assert lemmas == ['ქართული', 'ენა', '—', 'იბერიულ-კავკასიურ', 'ენათა', 'ოჯახის', 'ქართველურ', 'ენათა', 'ჯგუფი', 'ენა', '.'] - elif lang.startswith('deu_'): - assert lemmas == ['der', 'Deutscher', 'sein', 'ein', 'plurizentrische', 'Sprache', ',', 'enthalten', 'also', 'mehrere', 'Standardvarietät', 'in', 'verschieden', 'Region', '.'] - elif lang == 'grc': - assert lemmas == ['ἔρχομαι', 'δέ', 'ὁ', 'δύο', 'ἄγγελος', 'εἰς', 'Σόδομα', 'ἑσπέρα', '·', 'Λὼτ', 'δέ', 'κάθημαι', 'παρά', 'ὁ', 'πύλη', 'Σοδόμων', '.', 'εἶδον', 'δέ', 'Λὼτ', 'ἐξανίστημι', 'εἰς', 'συνάντησιν', 'αὐτός', 'καί', 'προσκυνέω', 'ὁ', 'πρόσωπον', 'ἐπί', 'ὁ', 'γῆ'] - elif lang == 'ell': - assert lemmas == ['ο', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια', '[', '9', ']', 'και', 'αποτελώ', 'ο', 'μοναδικός', 'μέλος', 'ο', 'ελληνικός', 'κλάδος', ',', 'ενώ', 'είμαι', 'ο', 'επίσημος', 'γλώσσα', 'ο', 'Ελλάδα', 'και', 'ο', 'Κύπρος', '.'] - elif lang == 'hin': - assert lemmas == ['हिंदी', 'जिसके', 'मानकीकृत', 'रूप', 'को', 'मानक', 'हिंदी', 'कहना', 'जाना', 'होना', ',', 'विश्व', 'का', 'एक', 'प्रमुख', 'भाषा', 'होना', 'और', 'भारत', 'का', 'एक', 'राजभाषा', 'है।'] - elif lang == 'hun': - if lemmatizer == 'simplemma_hun': - assert lemmas == ['a', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelve', 'köz', 'tartozik', 'ugor', 'nyelve', 'egyik', '.'] - elif lemmatizer == 'spacy_hun': - assert lemmas == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv', 'egyik', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'isl': - assert lemmas == ['íslenskur', 'vera', 'vesturnorrænt', ',', 'germanskur', 'og', 'indóevrópskur', 'tungumál', 'semja', 'vera', 'einkum', 'tala', 'og', 'rita', 'ær', 'Ísland', 'og', 'vera', 'móðurmál', 'langflestra', 'Íslendinga.', '[', '5', ']'] - elif lang == 'ind': - if lemmatizer == 'simplemma_ind': - assert lemmas == ['bahasa', 'Indonesia', 'adalah', 'bahasa', 'nasional', 'dan', 
'resmi', 'di', 'seluruh', 'wilayah', 'Indonesia', '.'] - elif lemmatizer == 'spacy_ind': - assert lemmas == ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'nasional', 'dan', 'resmi', 'di', 'seluruh', 'wilayah', 'Indonesia', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'gle': - if lemmatizer == 'simplemma_gle': - assert lemmas == ['Is', 'ceann', 'de', 'na', 'teangach', 'ceilteach', 'í', 'an', 'gaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'de', 'na', 'trí', 'ceann', 'de', 'teangach', 'ceilteach', 'ar', 'a', 'tabhair', 'na', 'teangach', 'gaelach', '(', 'Gaeilge', ',', 'Gaeilge', 'manainn', 'agus', 'Gaeilge', 'na', 'hAlban', ')', 'go', 'áirithe', '.'] - elif lemmatizer == 'spacy_gle': - assert lemmas == ['is', 'ceann', 'de', 'na', 'teangacha', 'ceilteacha', 'í', 'an', 'ghaeilge', '(', 'nó', 'gaeilge', 'na', 'héireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'de', 'na', 'trí', 'cinn', 'de', 'theangacha', 'ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'gaelacha', '(', 'gaeilge', ',', 'gaeilge', 'mhanann', 'agus', 'gaeilge', 'na', 'halban', ')', 'go', 'háirithe', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'ita': - assert lemmas == ["L'italiano", '(', '[', 'itaˈljaːno', ']', '[', 'nota', '1', ']', 'ascoltaⓘ', ')', 'essere', 'uno', 'lingua', 'romanza', 'parlato', 'principalmente', 'in', 'Italia', '.'] - elif lang == 'jpn': - assert lemmas == ['日本語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注釈', '2', ']', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だ', 'た', '国', '、', 'そして', '国外', '移民', 'や', '移住者', 'を', '含む', '日本人', '同士', 'の', '間', 'で', '使用', 'する', 'れる', 'て', 'いる', '言語', '。'] - elif lang == 'kor': - assert lemmas == ['한국어', '(', '韓國語', ')', '는', '대한민+국과', '조선민주주의인민공화국+의', '공용어이다', '.'] - elif lang == 'lat': - assert lemmas == ['lingua', 'Latinus', ',', '[', '1', ']', 'sive', 'sermo', 'Latinus', ',', '[', '2', ']', 'sum', 'lingua', 'indoeuropaeus', 'qui', 'primus', 'Latinus', 'universus', 'et', 'Romanus', 'antiquus', 'in', 'primus', 'loquor', 'quamobrem', 'interdum', 'etiam', 'lingua', 'Latius', '[', '3', ']', '(', 'in', 'Latium', 'enim', 'suetus', ')', 'et', 'lingua', 'Romanus', '[', '4', ']', '(', 'nam', 'imperium', 'Romanus', 'sermo', 'sollemne', ')', 'appello', '.'] - elif lang == 'lav': - assert lemmas == ['latviete', 'valoda', 'būt', 'dzimta', 'valoda', 'apmērs', '1,5', 'miljons', 'cilvēks', ',', 'galvenokārt', 'Latvija', ',', 'kur', 'tā', 'būt', 'vienīgs', 'valsts', 'valoda.', '[', '1', ']', '[', '3', ']'] - elif lang == 'lit': - assert lemmas == ['lietuvė', 'kalba', '–', 'ižti', 'baltas', 'prokalbė', 'kilęs', 'lietuvė', 'tauta', 'kalba', ',', 'kurti', 'Lietuva', 'irti', 'valstybinis', ',', 'o', 'Europa', 'sąjunga', '–', 'Viena', 'ižti', 'oficialus', 'kalbus', '.'] - elif lang == 'ltz': - if lemmatizer == 'simplemma_ltz': - assert lemmas == ["D'Lëtzebuergesch", 'ginn', 'an', 'der', 'däitsch', 'Dialektologie', 'als', 'een', 'westgermanesch', ',', 'mëtteldäitsch', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'muselfränkesch', 'gehéiert', '.'] - elif lemmatizer == 'spacy_ltz': - assert lemmas == ["D'", 'Lëtzebuergesch', 'ginn', 'an', 'der', 'däitsch', 'Dialektologie', 'als', 'een', 'westgermanesch', ',', 'mëtteldäitsch', 'Dialekt', 'aklasséieren', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéieren', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'mkd': - assert lemmas == ['македонски', 'јазик', '—', 'јужнословенски', 
'јазик', ',', 'дел', 'од', 'група', 'на', 'словенски', 'јазик', 'од', 'јазичното', 'семејство', 'на', 'индоевропски', 'јазик', '.'] - elif lang == 'msa': - assert lemmas == ['bahasa', 'Melayu', '(', 'tulisan', 'Jawi', ':', 'bahasa', 'Melayu', ';', 'rencong', ':', 'ꤷꥁꤼ', 'ꤸꥍꤾꤿꥈ', ')', 'ialah', 'salah', 'ساتو', 'daripada', 'bahasa', 'Melayu-Polinesia', 'di', 'bawah', 'keluarga', 'bahasa', 'Austronesia', ',', 'hiang', 'merupakan', 'bahasa', 'rasmi', 'di', 'Brunei', ',', 'Indonesia', ',', 'Malaysia', 'دان', 'Singapura', ',', 'serta', 'dituturkan', 'di', 'timur', 'Leste', 'دان', 'sebahagian', 'wilayah', 'di', 'Kemboja', ',', 'Filipina', 'دان', 'Thailand', '.'] - elif lang == 'glv': - assert lemmas == ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.'] - elif lang == 'nob': - assert lemmas == ['bokmål', 'være', 'enn', 'av', 'to', 'offisiell', 'målform', 'av', 'norsk', 'skriftspråk', ',', 'hvorav', 'den', 'annen', 'være', 'nynorsk', '.'] - elif lang == 'nno': - assert lemmas == ['nynorsk', ',', 'føra', '1929', 'offisiell', 'kall', 'landsmål', ',', 'vera', 'sidan', 'jamstillingsvedtaket', 'av', '12', '.', 'mai', '1885', 'ein', 'av', 'den', 'to', 'offisiell', 'målformene', 'av', 'norsk', ';', 'den', 'annan', 'forme', 'vera', 'bokmål', '.'] - elif lang == 'fas': - if lemmatizer == 'simplemma_fas': - assert lemmas == ['فارسی', 'یا', 'پارسی', 'یک', 'زبان', 'ایرانی', 'غربی', 'از', 'زیرگروه', 'ایرانی', 'شاخهٔ', 'هندوایرانیِ', 'خانوادهٔ', 'زبان\u200cهای', 'هندواروپایی', 'است', 'که', 'در', 'کشورهای', 'ایران،', 'افغانستان،', 'تاجیکستان،', 'ازبکستان،', 'پاکستان،', 'عراق،', 'ترکمنستان', 'را', 'آذربایجان', 'به', 'آن', 'سخن', 'می\u200cگویند', '.'] - elif lemmatizer == 'spacy_fas': - assert lemmas == ['فارسی', 'یا', 'پارسی', 'یک', 'زبان', 'ایرانی', 'غربی', 'از', 'زیرگروه', 'ایرانی', 'شاخهٔ', 'هندوایرانیِ', 'خانوادهٔ', 'زبان\u200cهای', 'هندواروپایی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', 'تاجیکستان', '،', 'ازبکستان', '،', 'پاکستان', '،', 'عراق', '،', 'ترکمنستان', 'و', 'آذربایجان', 'به', 'آن', 'سخن', 'می\u200cگویند', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'pol': - assert lemmas == ['język', 'polski', ',', 'polszczyzna', '–', 'język', 'z', 'grupa', 'zachodniosłowiański', '(', 'do', 'który', 'należeć', 'również', 'czeski', ',', 'kaszubski', ',', 'słowacki', 'i', 'język', 'łużycki', ')', ',', 'stanowić', 'część', 'rodzina', 'indoeuropejski', '.'] - elif lang.startswith('por_'): - assert lemmas == ['o', 'língua', 'portuguesar', ',', 'também', 'designado', 'português', ',', 'ser', 'umar', 'língua', 'indo-europeu', 'românico', 'flexivo', 'ocidental', 'originado', 'o', 'galego-português', 'falar', 'o', 'reino', 'da', 'galiza', 'e', 'o', 'norte', 'de', 'portugal', '.'] - elif lang == 'ron': - assert lemmas == ['limbă', 'român', 'fi', 'el', 'limbă', 'indo-european', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.'] - elif lang == 'rus': - if lemmatizer == 'simplemma_rus': - assert lemmas == ['Ру́сский', 'язы́к', '(', 'МФА', ':', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянский', 'группа', 'славянский', 'ветвь', 'индоевропейский', 'языковый', 'семья', ',', 'национальный', 'язык', 'русский', 'народ', '.'] - elif lemmatizer == 'pymorphy3_morphological_analyzer': - assert lemmas == ['ру́сский', 'язы́к', '(', 'мфа', ':', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянский', 'группа', 
'славянский', 'ветвь', 'индоевропейский', 'языковой', 'семья', ',', 'национальный', 'язык', 'русский', 'народ', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'sme': - assert lemmas == ['davvisámegiella', 'gullát', 'sámegiella', 'oarjesámegielaid', 'davvejovkui', 'ovttastit', 'julev-', 'ja', 'bihtánsámegielain', '.'] - elif lang == 'gla': - assert lemmas == ["'S", 'i', 'cànan', 'dùthchasach', 'na', 'h-alba', 'a', 'th', "'", 'anns', 'a', "'", 'gàidhlig', '.'] - elif lang == 'srp_cyrl': - assert lemmas == ['Српски', 'језик', 'бити', 'званичан', 'у', 'Србији', ',', 'Босни', 'и', 'Херцеговини', 'и', 'Црној', 'Гори', 'и', 'говорити', 'он', 'око', '12', 'милион', 'људи.[13', ']'] - elif lang == 'srp_latn': - assert lemmas == ['srpski', 'jezik', 'ju', 'zvaničan', 'u', 'Srbija', ',', 'Bosna', 'i', 'Hercegovina', 'i', 'crn', 'gora', 'i', 'govoriti', 'ih', 'oko', '12', 'milion', 'ljudi.', '[', '13', ']'] - elif lang == 'slk': - assert lemmas == ['slovenčina', 'byť', 'oficiálne', 'úradný', 'jazyk', 'Slovensko', ',', 'vojvodiny', 'a', 'od', '1', '.', 'máj', '2004', 'jeden', 'z', 'jazyk', 'európsky', 'únia', '.'] - elif lang == 'slv': - assert lemmas == ['slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'on', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govor', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govorilo', 'Slovenec', '.'] - elif lang == 'spa': - assert lemmas == ['el', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', ',', 'perteneciente', 'a', 'el', 'familia', 'de', 'lengua', 'indoeuropeo', '.'] - elif lang == 'swa': - assert lemmas == ['Kiswahili', 'ni', 'lugha', 'ya', 'Kibantu', 'enye', 'msamiati', 'ingi', 'ya', 'Kiarabu', '(', '35', '%', '[', '1', ']', ')', ',', 'laki', 'sasa', 'ya', 'Kiingereza', 'pia', '(', '10', '%', ')', ',', 'inayozungumzwa', 'katika', 'eneo', 'kubwa', 'la', 'Afrika', 'ya', 'mashariki', '.'] - elif lang == 'swe': - assert lemmas == ['svensk', '(', 'svensk', '(', 'info', ')', ')', 'ära', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'ha', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'den', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.'] - elif lang == 'tgl': - if lemmatizer == 'simplemma_tgl': - assert lemmas == ['Ang', 'wikang', 'Tagalog', '[', '1', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜆᜄᜎᜓ', ')', ',', 'o', 'ang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pinakaginagamit', 'na', 'wikain', 'ng', 'Pilipinas', '.'] - elif lemmatizer == 'spacy_tgl': - assert lemmas == ['Ang', 'wikang', 'Tagalog[1', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜆᜄᜎᜓ', ')', ',', 'o', 'ang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pinakaginagamit', 'na', 'wika', 'ng', 'Pilipinas', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'bod': - assert lemmas == ['བོད་', 'གི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'ཉེ་འཁོར་', 'གི་', 'ས་ཁུལ་', 'བལ་ཡུལ་', '།', 'འབྲུག་', 'དང་', 'འབྲས་ལྗོངས་', '།'] - elif lang == 'tur': - if lemmatizer == 'simplemma_tur': - assert lemmas == ['türkçe', 'ya', 'da', 'Türk', 'dil', ',', 'güneydoğu', 'avrupa', 've', 'batı', 'asya', 'konuş', ',', 'Türk', 'dil', 'dil', 'aile', 'ait', 'son', 'ekle', 'bir', 'dil.', '[', '12', ']'] - elif lemmatizer == 'spacy_tur': - assert lemmas == ['Türkçe', 'ya', 'da', 'Türk', 'dil', ',', 'Güneydoğu', 'Avrupa', 've', 'Batı', 
"Asya'da", 'konuş', ',', 'Türk', 'dil', 'dil', 'aile', 'ait', 'son', 'ekle', 'bir', 'dil.[12', ']'] - else: - tests_lang_util_skipped = True - elif lang == 'ukr': - if lemmatizer == 'pymorphy3_morphological_analyzer': - assert lemmas == ['украї́нський', 'мо́вий', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'ру́ський', '[', '10', ']', '[', '11', ']', '[', '12', ']', '[', '*', '1', ']', ')', '—', 'національний', 'мова', 'українець', '.'] - elif lemmatizer == 'simplemma_ukr': - assert lemmas == ['Українськ', 'мо́ва', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'руський', '[', '10', ']', '[', '11', ']', '[', '12', ']', '[', '*', '1', ']', ')', '—', 'національний', 'мова', 'українець', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'urd': - assert lemmas == ['اُردُو[8', ']', 'برصغیر', 'کم', 'معیاری', 'زبان', 'میں', 'سے', 'ایک', 'ہونا', '۔'] - elif lang == 'cym': - assert lemmas == ['yn', 'cyfrifiad', 'yr', 'tu', '(', '2011', ')', ',', 'darganfod', 'bodio', '19', '%', '(', '562,000', ')', 'prpers', 'preswylwr', 'cymru', '(', 'tair', 'blwydd', 'a', 'trosodd', ')', 'bod', 'gallu', 'siarad', 'cymraeg', '.'] + if lang in lang_exceptions: + assert len(lemmas_long) == 101 * 10 else: - raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + assert lemmas_long == wl_test_lang_examples.TOKENS_LONG - if tests_lang_util_skipped: - raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(lemmatizer) + # Lemmatized + lemmas_orig = ['tests'] + tokens_lemmatized = wl_lemmatization.wl_lemmatize( + main, + inputs = wl_texts.to_tokens(['test'], lang = lang, lemmas = lemmas_orig), + lang = lang, + lemmatizer = lemmatizer + ) + lemmas_lemmatized = [token.lemma for token in tokens_lemmatized] + + assert lemmas_lemmatized == lemmas_orig if __name__ == '__main__': for lang, lemmatizer in test_lemmatizers_local: diff --git a/tests/tests_nlp/test_matching.py b/tests/tests_nlp/test_matching.py index b00662779..cfa5df217 100644 --- a/tests/tests_nlp/test_matching.py +++ b/tests/tests_nlp/test_matching.py @@ -19,7 +19,7 @@ import re from tests import wl_test_init -from wordless.wl_nlp import wl_matching +from wordless.wl_nlp import wl_matching, wl_texts main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') @@ -200,171 +200,188 @@ def test_check_search_settings(): assert wl_matching.check_search_settings(TOKEN_SETTINGS_3, SEARCH_SETTINGS_5) == SEARCH_SETTINGS_5 assert wl_matching.check_search_settings(TOKEN_SETTINGS_3, SEARCH_SETTINGS_6) == SEARCH_SETTINGS_6 +def compare_tokens_matched(tokens_matched, tokens_expected): + tokens_matched = [token.display_text() for token in tokens_matched] + + print(f'Tokens matched: {sorted(tokens_matched)}') + print(f'Tokens expected: {sorted(tokens_expected)}\n') + + assert set(tokens_matched) == set(tokens_expected) + +def compare_ngrams_matched(ngrams_matched, ngrams_expected): + ngrams_matched = [tuple(token.display_text() for token in ngram) for ngram in ngrams_matched] + + print(f'Tokens matched: {sorted(ngrams_matched)}') + print(f'Tokens expected: {sorted(ngrams_expected)}\n') + + assert set(ngrams_matched) == set(ngrams_expected) + +def compare_context_matched(context_matched, context_expected): + context_matched = ( + {tuple(token.display_text() for token in ngram) for ngram in context_matched[0]}, + {tuple(token.display_text() for token in ngram) for ngram in context_matched[1]} + ) + + print(f'Tokens matched: {sorted(context_matched)}') + print(f'Tokens expected: 
{sorted(context_expected)}\n') + + assert context_matched == context_expected + def test_match_tokens(): - assert wl_matching.match_tokens( + compare_tokens_matched(wl_matching.match_tokens( main, search_terms = ['tAke'], - tokens = ['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'TaKEs', 'test'], + tokens = wl_texts.to_tokens( + ['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'TaKEs', 'test'], + lang = 'eng_us', + tags = ['', '', '', '', '', '', '_TAKE'] + ), lang = 'eng_us', - tagged = False, settings = init_search_settings() - ) == set(['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'TaKEs']) + ), ['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'TaKEs', 'test_TAKE']) - assert wl_matching.match_tokens( + compare_tokens_matched(wl_matching.match_tokens( main, search_terms = ['tAke'], - tokens = ['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'test'], + tokens = wl_texts.to_tokens(['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(match_case = True) - ) == set(['tAke']) + ), ['tAke']) - assert wl_matching.match_tokens( + compare_tokens_matched(wl_matching.match_tokens( main, search_terms = ['take'], - tokens = ['take', 'takes', 'took', 'taken', 'taking', 'test'], + tokens = wl_texts.to_tokens(['take', 'takes', 'took', 'taken', 'taking', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(match_whole_words = True) - ) == set(['take']) + ), ['take']) - assert wl_matching.match_tokens( + compare_tokens_matched(wl_matching.match_tokens( main, search_terms = ['takes'], - tokens = ['take', 'takes', 'took', 'taken', 'taking', 'test'], + tokens = wl_texts.to_tokens(['take', 'takes', 'took', 'taken', 'taking', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(match_inflected_forms = True) - ) == set(['take', 'takes', 'took', 'taken']) + ), ['take', 'takes', 'took', 'taken']) - assert wl_matching.match_tokens( + compare_tokens_matched(wl_matching.match_tokens( main, search_terms = ['take[sn]'], - tokens = ['take', 'takes', 'took', 'taken', 'taking', 'test'], + tokens = wl_texts.to_tokens(['take', 'takes', 'took', 'taken', 'taking', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(use_regex = True) - ) == set(['takes', 'taken']) - - assert wl_matching.match_tokens( - main, - search_terms = ['take'], - tokens = ['take', 'take_NN', 'taked_NN', 'test'], - lang = 'eng_us', - tagged = True, - settings = init_search_settings(match_whole_words = True, match_without_tags = True) - ) == set(['take', 'take_NN']) + ), ['takes', 'taken']) - assert wl_matching.match_tokens( + compare_tokens_matched(wl_matching.match_tokens( main, - search_terms = ['_NN'], - tokens = ['take', 'take_NN', 'taked_NN', 'taked_NNP', 'test'], + search_terms = ['takes'], + tokens = wl_texts.to_tokens( + ['take', 'takes', 'took', 'test'], + lang = 'eng_us', + tags = ['', '_NN', '_NN', '_TAKES'] + ), lang = 'eng_us', - tagged = True, - settings = init_search_settings(match_whole_words = True, match_tags = True) - ) == set(['take_NN', 'taked_NN']) + settings = init_search_settings(match_without_tags = True) + ), ['takes_NN']) - assert wl_matching.match_tokens( + compare_tokens_matched(wl_matching.match_tokens( main, search_terms = ['_NN'], - tokens = ['take', 'take_NN', 'taked_NN', 'taked_NNP', 'test'], + tokens = wl_texts.to_tokens( + ['take', 'takes', 'took', 'taken', 'test_NN'], + lang = 'eng_us', + tags = ['', '_NN', '_NN', '_NNP', ''] + ), lang = 'eng_us', - 
tagged = False, settings = init_search_settings(match_whole_words = True, match_tags = True) - ) == set() + ), ['takes_NN', 'took_NN']) def test_match_ngrams(): - assert wl_matching.match_ngrams( + compare_ngrams_matched(wl_matching.match_ngrams( main, search_terms = ['tAke WaLK'], - tokens = ['take', 'TAKE', 'WaLK', 'wAlk', 'test'], + tokens = wl_texts.to_tokens( + ['take', 'TAKE', 'WaLK', 'test'], + lang = 'eng_us', + tags = ['', '', '', '_wAlk'] + ), lang = 'eng_us', - tagged = False, settings = init_search_settings() - ) == set([('take', 'WaLK'), ('take', 'wAlk'), ('TAKE', 'WaLK'), ('TAKE', 'wAlk')]) + ), [('take', 'WaLK'), ('take', 'test_wAlk'), ('TAKE', 'WaLK'), ('TAKE', 'test_wAlk')]) - assert wl_matching.match_ngrams( + compare_ngrams_matched(wl_matching.match_ngrams( main, search_terms = ['tAke WaLK'], - tokens = ['take', 'tAke', 'WALK', 'WaLK', 'test'], + tokens = wl_texts.to_tokens(['take', 'tAke', 'WALK', 'WaLK', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(match_case = True) - ) == set([('tAke', 'WaLK')]) + ), [('tAke', 'WaLK')]) - assert wl_matching.match_ngrams( + compare_ngrams_matched(wl_matching.match_ngrams( main, search_terms = ['take walk'], - tokens = ['take', 'takes', 'walk', 'walked', 'test'], + tokens = wl_texts.to_tokens(['take', 'takes', 'walk', 'walked', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(match_whole_words = True) - ) == set([('take', 'walk')]) + ), [('take', 'walk')]) - assert wl_matching.match_ngrams( + compare_ngrams_matched(wl_matching.match_ngrams( main, search_terms = ['took walks'], - tokens = ['take', 'takes', 'walk', 'walked', 'test'], + tokens = wl_texts.to_tokens(['take', 'takes', 'walk', 'walked', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(match_inflected_forms = True) - ) == set([('take', 'walk'), ('take', 'walked'), ('takes', 'walk'), ('takes', 'walked')]) + ), [('take', 'walk'), ('take', 'walked'), ('takes', 'walk'), ('takes', 'walked')]) - assert wl_matching.match_ngrams( + compare_ngrams_matched(wl_matching.match_ngrams( main, - search_terms = ['take[dn] walk(s|ing)'], - tokens = ['taked', 'taken', 'takes', 'walks', 'walking', 'walked', 'test'], + search_terms = ['took|taken walk(s|ing)'], + tokens = wl_texts.to_tokens(['took', 'taken', 'takes', 'walks', 'walking', 'walked', 'test'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, settings = init_search_settings(use_regex = True) - ) == set([('taked', 'walks'), ('taked', 'walking'), ('taken', 'walks'), ('taken', 'walking')]) + ), [('took', 'walks'), ('took', 'walking'), ('taken', 'walks'), ('taken', 'walking')]) - assert wl_matching.match_ngrams( + compare_ngrams_matched(wl_matching.match_ngrams( main, - search_terms = ['take walk'], - tokens = ['take', 'take_NN', 'taked_NN', 'walk', 'walk_NN', 'walking_NN', 'test'], + search_terms = ['takes walks', 'took walked'], + tokens = wl_texts.to_tokens( + ['take', 'takes', 'took', 'walk', 'walks', 'walked', 'test'], + lang = 'eng_us', + tags = ['', '_NN', '_NN', '', '_NN', '_NN', '_TAKES'] + ), lang = 'eng_us', - tagged = True, - settings = init_search_settings(match_whole_words = True, match_without_tags = True) - ) == set([('take', 'walk'), ('take', 'walk_NN'), ('take_NN', 'walk'), ('take_NN', 'walk_NN')]) + settings = init_search_settings(match_without_tags = True) + ), [('takes_NN', 'walks_NN'), ('took_NN', 'walked_NN')]) - assert wl_matching.match_ngrams( + 
compare_ngrams_matched(wl_matching.match_ngrams( main, search_terms = ['_NN _JJ'], - tokens = ['take', 'take_NN', 'taked_NN', 'taked_NNP', 'walk', 'walk_JJ', 'walked_JJ', 'walked_JJS', 'test'], - lang = 'eng_us', - tagged = True, - settings = init_search_settings(match_whole_words = True, match_tags = True) - ) == set([('take_NN', 'walk_JJ'), ('take_NN', 'walked_JJ'), ('taked_NN', 'walk_JJ'), ('taked_NN', 'walked_JJ')]) - - assert wl_matching.match_ngrams( - main, - search_terms = ['_NN'], - tokens = ['take', 'take_NN', 'taked_NN', 'taked_NNP', 'walk', 'walk_JJ', 'walked_JJ', 'walked_JJS', 'test'], + tokens = wl_texts.to_tokens( + ['take', 'takes', 'took', 'walk', 'walks', 'walked', 'test_JJ'], + lang = 'eng_us', + tags = ['', '_NN', '_NNP', '', '_JJ', '_JJS', ''] + ), lang = 'eng_us', - tagged = False, settings = init_search_settings(match_whole_words = True, match_tags = True) - ) == set() + ), [('takes_NN', 'walks_JJ')]) def test_match_search_terms_tokens(): - assert wl_matching.match_search_terms_tokens( + compare_tokens_matched(wl_matching.match_search_terms_tokens( main, - tokens = ['take'], + tokens = wl_texts.to_tokens(['take'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, token_settings = init_token_settings(), search_settings = init_search_settings(search_term = 'take') - ) == set(['take']) + ), ['take']) def test_match_search_terms_ngrams(): - assert wl_matching.match_search_terms_ngrams( + compare_ngrams_matched(wl_matching.match_search_terms_ngrams( main, - tokens = ['take', 'walk'], + tokens = wl_texts.to_tokens(['take', 'walk'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, token_settings = init_token_settings(), search_settings = init_search_settings(search_term = 'take walk') - ) == set([('take', 'walk')]) + ), [('take', 'walk')]) def init_context_settings( incl = False, incl_multi_search_mode = False, incl_search_term = '', incl_search_terms = None, @@ -399,44 +416,40 @@ def init_context_settings( return context_settings def test_match_search_terms_context(): - assert wl_matching.match_search_terms_context( + compare_context_matched(wl_matching.match_search_terms_context( main, - tokens = ['take'], + tokens = wl_texts.to_tokens(['take'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, token_settings = init_token_settings(), context_settings = init_context_settings() - ) == (set(), set()) + ), (set(), set())) - assert wl_matching.match_search_terms_context( + compare_context_matched(wl_matching.match_search_terms_context( main, - tokens = ['take', 'walk'], + tokens = wl_texts.to_tokens(['take', 'walk'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, token_settings = init_token_settings(), context_settings = init_context_settings(incl = True, incl_search_term = 'take walk') - ) == ({('take', 'walk')}, set()) + ), ({('take', 'walk')}, set())) - assert wl_matching.match_search_terms_context( + compare_context_matched(wl_matching.match_search_terms_context( main, - tokens = ['take', 'walk'], + tokens = wl_texts.to_tokens(['take', 'walk'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, token_settings = init_token_settings(), context_settings = init_context_settings(excl = True, excl_search_term = 'take walk') - ) == (set(), {('take', 'walk')}) + ), (set(), {('take', 'walk')})) - assert wl_matching.match_search_terms_context( + compare_context_matched(wl_matching.match_search_terms_context( main, - tokens = ['take', 'walk'], + tokens = wl_texts.to_tokens(['take', 'walk'], lang = 'eng_us'), lang = 'eng_us', - tagged = False, token_settings = 
init_token_settings(), context_settings = init_context_settings( incl = True, incl_search_term = 'take walk', excl = True, excl_search_term = 'take walk' ) - ) == ({('take', 'walk')}, {('take', 'walk')}) + ), ({('take', 'walk')}, {('take', 'walk')})) def test_check_context(): assert wl_matching.check_context( diff --git a/tests/tests_nlp/test_pos_tagging.py b/tests/tests_nlp/test_pos_tagging.py index 927911c92..477092ff7 100644 --- a/tests/tests_nlp/test_pos_tagging.py +++ b/tests/tests_nlp/test_pos_tagging.py @@ -19,7 +19,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import wl_pos_tagging, wl_word_tokenization +from wordless.wl_nlp import wl_pos_tagging, wl_texts, wl_word_tokenization from wordless.wl_utils import wl_misc _, is_macos, _ = wl_misc.check_os() @@ -44,34 +44,109 @@ @pytest.mark.parametrize('lang, pos_tagger', test_pos_taggers) def test_pos_tag(lang, pos_tagger): + tests_lang_util_skipped = False + test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') + + tokens = wl_word_tokenization.wl_word_tokenize_flat( + main, + text = test_sentence, + lang = lang + ) + + match lang: + case 'eng_gb' | 'eng_us': + results = [('English', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('West', 'NNP'), ('Germanic', 'NNP'), ('language', 'NN'), ('in', 'IN'), ('the', 'DT'), ('Indo-European', 'JJ'), ('language', 'NN'), ('family', 'NN'), ('.', '.')] + results_universal = [('English', 'PROPN'), ('is', 'VERB'), ('a', 'DET'), ('West', 'PROPN'), ('Germanic', 'PROPN'), ('language', 'NOUN'), ('in', 'ADP/SCONJ'), ('the', 'DET'), ('Indo-European', 'ADJ'), ('language', 'NOUN'), ('family', 'NOUN'), ('.', 'PUNCT')] + case 'jpn': + results = [('日本語', '名詞-普通名詞-一般'), ('(', '補助記号-括弧開'), ('にほん', '名詞-固有名詞-地名-国'), ('ご', '接尾辞-名詞的-一般'), ('、', '補助記号-読点'), ('にっぽん', '名詞-固有名詞-地名-国'), ('ご', '接尾辞-名詞的-一般'), ('[', '補助記号-括弧開'), ('注釈', '名詞-普通名詞-サ変可能'), ('2', '名詞-数詞'), (']', '補助記号-括弧閉'), (')', '補助記号-括弧閉'), ('は', '助詞-係助詞'), ('、', '補助記号-読点'), ('日本', '名詞-固有名詞-地名-国'), ('国', '接尾辞-名詞的-一般'), ('内', '接尾辞-名詞的-一般'), ('や', '助詞-副助詞'), ('、', '補助記号-読点'), ('かつて', '副詞'), ('の', '助詞-格助詞'), ('日本', '名詞-固有名詞-地名-国'), ('領', '接尾辞-名詞的-一般'), ('だっ', '助動詞'), ('た', '助動詞'), ('国', '名詞-普通名詞-一般'), ('、', '補助記号-読点'), ('そして', '接続詞'), ('国外', '名詞-普通名詞-一般'), ('移民', '名詞-普通名詞-サ変可能'), ('や', '助詞-副助詞'), ('移住者', '名詞-普通名詞-一般'), ('を', '助詞-格助詞'), ('含む', '動詞-一般'), ('日本人', '名詞-普通名詞-一般'), ('同士', '接尾辞-名詞的-一般'), ('の', '助詞-格助詞'), ('間', '名詞-普通名詞-副詞可能'), ('で', '助詞-格助詞'), ('使用', '名詞-普通名詞-サ変可能'), ('さ', '動詞-非自立可能'), ('れ', '助動詞'), ('て', '助詞-接続助詞'), ('いる', '動詞-非自立可能'), ('言語', '名詞-普通名詞-一般'), ('。', '補助記号-句点')] + results_universal = [('日本語', 'NOUN'), ('(', 'PUNCT'), ('にほん', 'PROPN'), ('ご', 'NOUN'), ('、', 'PUNCT'), ('にっぽん', 'PROPN'), ('ご', 'NOUN'), ('[', 'PUNCT'), ('注釈', 'NOUN'), ('2', 'NUM'), (']', 'PUNCT'), (')', 'PUNCT'), ('は', 'ADP'), ('、', 'PUNCT'), ('日本', 'PROPN'), ('国', 'NOUN'), ('内', 'NOUN'), ('や', 'ADP'), ('、', 'PUNCT'), ('かつて', 'ADV'), ('の', 'ADP'), ('日本', 'PROPN'), ('領', 'NOUN'), ('だっ', 'AUX'), ('た', 'AUX'), ('国', 'NOUN'), ('、', 'PUNCT'), ('そして', 'CCONJ'), ('国外', 'NOUN'), ('移民', 'NOUN'), ('や', 'ADP'), ('移住者', 'NOUN'), ('を', 'ADP'), ('含む', 'VERB'), ('日本人', 'NOUN'), ('同士', 'NOUN'), ('の', 'ADP'), ('間', 'NOUN'), ('で', 'ADP'), ('使用', 'NOUN'), ('さ', 'AUX'), ('れ', 'AUX'), ('て', 'SCONJ'), ('いる', 'AUX'), ('言語', 'NOUN'), ('。', 'PUNCT')] + case 'khm': + results = [('ភាសា', 'n'), ('ខ្មែរ', 'n'), ('គឺជា', 'v'), ('ភាសា', 'n'), ('កំណើត', 'n'), ('របស់', 'o'), ('ជនជាតិ', 'n'), ('ខ្មែរ', 'n'), ('និង', 'o'), ('ជា', 'v'), ('ភាសា', 'n'), 
('ផ្លូវការ', 'n'), ('របស់', 'o'), ('ប្រទេស', 'n'), ('កម្ពុជា', 'n'), ('។', '.')] + results_universal = [('ភាសា', 'NOUN'), ('ខ្មែរ', 'NOUN'), ('គឺជា', 'VERB'), ('ភាសា', 'NOUN'), ('កំណើត', 'NOUN'), ('របស់', 'PART'), ('ជនជាតិ', 'NOUN'), ('ខ្មែរ', 'NOUN'), ('និង', 'PART'), ('ជា', 'VERB'), ('ភាសា', 'NOUN'), ('ផ្លូវការ', 'NOUN'), ('របស់', 'PART'), ('ប្រទេស', 'NOUN'), ('កម្ពុជា', 'NOUN'), ('។', 'PUNCT')] + case 'kor': + results = [('세계', 'NNG'), ('여러', 'MM'), ('지역', 'NNG'), ('에', 'JKB'), ('한', 'MM'), ('민족', 'NNG'), ('인구', 'NNG'), ('가', 'JKS'), ('거주', 'NNG'), ('하', 'XSA'), ('게', 'EC'), ('되', 'VV'), ('면서', 'EC'), ('전', 'MM'), ('세계', 'NNG'), ('각지', 'NNG'), ('에서', 'JKB'), ('한국어', 'NNG'), ('가', 'JKS'), ('사용', 'NNG'), ('되', 'VV'), ('고', 'EC'), ('있', 'VX'), ('다', 'EF'), ('.', 'SF')] + results_universal = [('세계', 'NOUN'), ('여러', 'DET'), ('지역', 'NOUN'), ('에', 'ADP'), ('한', 'DET'), ('민족', 'NOUN'), ('인구', 'NOUN'), ('가', 'ADP'), ('거주', 'NOUN'), ('하', 'X'), ('게', 'X'), ('되', 'VERB'), ('면서', 'X'), ('전', 'DET'), ('세계', 'NOUN'), ('각지', 'NOUN'), ('에서', 'ADP'), ('한국어', 'NOUN'), ('가', 'ADP'), ('사용', 'NOUN'), ('되', 'VERB'), ('고', 'X'), ('있', 'AUX'), ('다', 'X'), ('.', 'PUNCT')] + case 'lao': + match pos_tagger: + case 'laonlp_seqlabeling': + results = [('ພາສາລາວ', 'N'), ('(', 'V'), ('Lao', 'PRN'), (':', 'PUNCT'), ('ລາວ', 'PRS'), (',', 'PUNCT'), ('[', 'PUNCT'), ('l', 'PRN'), ('áː', 'PRN'), ('w', 'PRN'), (']', 'PUNCT'), ('ຫຼື', 'COJ'), ('ພາສາລາວ', 'PRN'), (',', 'PUNCT'), ('[', 'N'), ('p', 'PRN'), ('ʰáː', 'PRN'), ('s', 'PRN'), ('ǎː', 'PRN'), ('l', 'PRN'), ('áː', 'PRN'), ('w', 'PRN'), ('])', 'PRN'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ຕະກູນ', 'PRN'), ('ໄທ', 'PRN'), ('-', 'PUNCT'), ('ກະໄດ', 'N'), ('ຂອງ', 'PRE'), ('ຄົນ', 'N'), ('ລາວ', 'PRS'), ('ໂດຍ', 'PRE'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ໃນປະເທດລາວ', 'N'), ('ເຊິ່ງ', 'REL'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ລັດຖະການ', 'N'), ('ຂອງ', 'PRE'), ('ສາທາລະນະລັດ', 'N'), ('ປະຊາທິປະໄຕ', 'N'), ('ປະຊາຊົນ', 'N'), ('ລາວ', 'PRS'), ('ຂອງ', 'PRE'), ('ປະຊາກອນ', 'N'), ('ປະມານ', 'IBQ'), ('7', 'V'), ('ລ້ານ', 'N'), ('ຄົນ', 'N'), ('ແລະ', 'COJ'), ('ໃນ', 'PRE'), ('ພື້ນທີ່', 'N'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'N'), ('ເໜືອ', 'PRN'), ('ຂອງ', 'PRE'), ('ປະເທດໄທ', 'PRN'), ('ທີ່ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ປະມານ', 'IBQ'), ('23', 'V'), ('ລ້ານ', 'N'), ('ຄົນ', 'N'), ('ທາງ', 'PRE'), ('ລັດຖະບານ', 'N'), ('ປະເທດໄທ', 'PRN'), ('ມີການສະໜັບສະໜຸນ', 'V'), ('ໃຫ້', 'PVA'), ('ເອີ້ນ', 'V'), ('ພາສາລາວ', 'N'), ('ຖິ່ນ', 'N'), ('ໄທ', 'PRN'), ('ວ່າ', 'COJ'), ('ພາສາລາວ', 'PRN'), ('ຖິ່ນ', 'PRN'), ('ອີສານ', 'N'), ('ນອກຈາກ', 'PRE'), ('ນີ້', 'DMN'), (',', 'PUNCT'), ('ຢູ່', 'PRE'), ('ທາງ', 'N'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'N'), ('ເໜືອ', 'N'), ('ຂອງ', 'PRE'), ('ປະເທດກຳປູເຈຍ', 'N'), ('ກໍ', 'IAC'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ພາສາລາວ', 'N'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] + results_universal = [('ພາສາລາວ', 'NOUN'), ('(', 'VERB'), ('Lao', 'PROPN'), (':', 'PUNCT'), ('ລາວ', 'PRON'), (',', 'PUNCT'), ('[', 'PUNCT'), ('l', 'PROPN'), ('áː', 'PROPN'), ('w', 'PROPN'), (']', 'PUNCT'), ('ຫຼື', 'CONJ'), ('ພາສາລາວ', 'PROPN'), (',', 'PUNCT'), ('[', 'NOUN'), ('p', 'PROPN'), ('ʰáː', 'PROPN'), ('s', 'PROPN'), ('ǎː', 'PROPN'), ('l', 'PROPN'), ('áː', 'PROPN'), ('w', 'PROPN'), ('])', 'PROPN'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ຕະກູນ', 'PROPN'), ('ໄທ', 'PROPN'), ('-', 'PUNCT'), ('ກະໄດ', 'NOUN'), ('ຂອງ', 'ADP'), ('ຄົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ໂດຍ', 'ADP'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ໃນປະເທດລາວ', 'NOUN'), ('ເຊິ່ງ', 'PRON'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ລັດຖະການ', 
'NOUN'), ('ຂອງ', 'ADP'), ('ສາທາລະນະລັດ', 'NOUN'), ('ປະຊາທິປະໄຕ', 'NOUN'), ('ປະຊາຊົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ຂອງ', 'ADP'), ('ປະຊາກອນ', 'NOUN'), ('ປະມານ', 'DET'), ('7', 'VERB'), ('ລ້ານ', 'NOUN'), ('ຄົນ', 'NOUN'), ('ແລະ', 'CONJ'), ('ໃນ', 'ADP'), ('ພື້ນທີ່', 'NOUN'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'NOUN'), ('ເໜືອ', 'PROPN'), ('ຂອງ', 'ADP'), ('ປະເທດໄທ', 'PROPN'), ('ທີ່ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ປະມານ', 'DET'), ('23', 'VERB'), ('ລ້ານ', 'NOUN'), ('ຄົນ', 'NOUN'), ('ທາງ', 'ADP'), ('ລັດຖະບານ', 'NOUN'), ('ປະເທດໄທ', 'PROPN'), ('ມີການສະໜັບສະໜຸນ', 'VERB'), ('ໃຫ້', 'AUX'), ('ເອີ້ນ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຖິ່ນ', 'NOUN'), ('ໄທ', 'PROPN'), ('ວ່າ', 'CONJ'), ('ພາສາລາວ', 'PROPN'), ('ຖິ່ນ', 'PROPN'), ('ອີສານ', 'NOUN'), ('ນອກຈາກ', 'ADP'), ('ນີ້', 'PRON'), (',', 'PUNCT'), ('ຢູ່', 'ADP'), ('ທາງ', 'NOUN'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'NOUN'), ('ເໜືອ', 'NOUN'), ('ຂອງ', 'ADP'), ('ປະເທດກຳປູເຈຍ', 'NOUN'), ('ກໍ', 'DET'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] + case 'laonlp_yunshan_cup_2020': + results = [('ພາສາລາວ', 'PRN'), ('(', 'PUNCT'), ('Lao', 'PRN'), (':', 'PUNCT'), ('ລາວ', 'PRS'), (',', 'PUNCT'), ('[', 'COJ'), ('l', 'N'), ('áː', 'N'), ('w', 'N'), (']', 'PUNCT'), ('ຫຼື', 'COJ'), ('ພາສາລາວ', 'PRN'), (',', 'PUNCT'), ('[', 'PUNCT'), ('p', 'PRN'), ('ʰáː', 'PRN'), ('s', 'PRN'), ('ǎː', 'PRN'), ('l', 'PRN'), ('áː', 'PRN'), ('w', 'PRN'), ('])', 'PRN'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ຕະກູນ', 'PRN'), ('ໄທ', 'PRN'), ('-', 'PUNCT'), ('ກະໄດ', 'N'), ('ຂອງ', 'PRE'), ('ຄົນ', 'N'), ('ລາວ', 'PRS'), ('ໂດຍ', 'PRE'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ໃນປະເທດລາວ', 'N'), ('ເຊິ່ງ', 'REL'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ລັດຖະການ', 'N'), ('ຂອງ', 'PRE'), ('ສາທາລະນະລັດ', 'N'), ('ປະຊາທິປະໄຕ', 'N'), ('ປະຊາຊົນ', 'N'), ('ລາວ', 'PRS'), ('ຂອງ', 'PRE'), ('ປະຊາກອນ', 'N'), ('ປະມານ', 'IBQ'), ('7', 'V'), ('ລ້ານ', 'V'), ('ຄົນ', 'N'), ('ແລະ', 'COJ'), ('ໃນ', 'PRE'), ('ພື້ນທີ່', 'N'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'V'), ('ເໜືອ', 'PRN'), ('ຂອງ', 'PRE'), ('ປະເທດໄທ', 'PRN'), ('ທີ່ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ປະມານ', 'IBQ'), ('23', 'V'), ('ລ້ານ', 'CLF'), ('ຄົນ', 'N'), ('ທາງ', 'PRE'), ('ລັດຖະບານ', 'N'), ('ປະເທດໄທ', 'PRN'), ('ມີການສະໜັບສະໜຸນ', 'V'), ('ໃຫ້', 'PVA'), ('ເອີ້ນ', 'V'), ('ພາສາລາວ', 'N'), ('ຖິ່ນ', 'N'), ('ໄທ', 'PRN'), ('ວ່າ', 'COJ'), ('ພາສາລາວ', 'PRN'), ('ຖິ່ນ', 'PRN'), ('ອີສານ', 'N'), ('ນອກຈາກ', 'PRE'), ('ນີ້', 'DMN'), (',', 'PUNCT'), ('ຢູ່', 'ADV'), ('ທາງ', 'PRE'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'N'), ('ເໜືອ', 'N'), ('ຂອງ', 'PRE'), ('ປະເທດກຳປູເຈຍ', 'N'), ('ກໍ', 'IAC'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ພາສາລາວ', 'N'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] + results_universal = [('ພາສາລາວ', 'PROPN'), ('(', 'PUNCT'), ('Lao', 'PROPN'), (':', 'PUNCT'), ('ລາວ', 'PRON'), (',', 'PUNCT'), ('[', 'CONJ'), ('l', 'NOUN'), ('áː', 'NOUN'), ('w', 'NOUN'), (']', 'PUNCT'), ('ຫຼື', 'CONJ'), ('ພາສາລາວ', 'PROPN'), (',', 'PUNCT'), ('[', 'PUNCT'), ('p', 'PROPN'), ('ʰáː', 'PROPN'), ('s', 'PROPN'), ('ǎː', 'PROPN'), ('l', 'PROPN'), ('áː', 'PROPN'), ('w', 'PROPN'), ('])', 'PROPN'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ຕະກູນ', 'PROPN'), ('ໄທ', 'PROPN'), ('-', 'PUNCT'), ('ກະໄດ', 'NOUN'), ('ຂອງ', 'ADP'), ('ຄົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ໂດຍ', 'ADP'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ໃນປະເທດລາວ', 'NOUN'), ('ເຊິ່ງ', 'PRON'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ລັດຖະການ', 'NOUN'), ('ຂອງ', 'ADP'), ('ສາທາລະນະລັດ', 'NOUN'), ('ປະຊາທິປະໄຕ', 'NOUN'), ('ປະຊາຊົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ຂອງ', 'ADP'), 
('ປະຊາກອນ', 'NOUN'), ('ປະມານ', 'DET'), ('7', 'VERB'), ('ລ້ານ', 'VERB'), ('ຄົນ', 'NOUN'), ('ແລະ', 'CONJ'), ('ໃນ', 'ADP'), ('ພື້ນທີ່', 'NOUN'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'VERB'), ('ເໜືອ', 'PROPN'), ('ຂອງ', 'ADP'), ('ປະເທດໄທ', 'PROPN'), ('ທີ່ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ປະມານ', 'DET'), ('23', 'VERB'), ('ລ້ານ', 'PART'), ('ຄົນ', 'NOUN'), ('ທາງ', 'ADP'), ('ລັດຖະບານ', 'NOUN'), ('ປະເທດໄທ', 'PROPN'), ('ມີການສະໜັບສະໜຸນ', 'VERB'), ('ໃຫ້', 'AUX'), ('ເອີ້ນ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຖິ່ນ', 'NOUN'), ('ໄທ', 'PROPN'), ('ວ່າ', 'CONJ'), ('ພາສາລາວ', 'PROPN'), ('ຖິ່ນ', 'PROPN'), ('ອີສານ', 'NOUN'), ('ນອກຈາກ', 'ADP'), ('ນີ້', 'PRON'), (',', 'PUNCT'), ('ຢູ່', 'ADV'), ('ທາງ', 'ADP'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'NOUN'), ('ເໜືອ', 'NOUN'), ('ຂອງ', 'ADP'), ('ປະເທດກຳປູເຈຍ', 'NOUN'), ('ກໍ', 'DET'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] + case _: + tests_lang_util_skipped = True + case 'rus': + match pos_tagger: + case 'nltk_perceptron_rus': + results = [('Ру́сский', 'A=m'), ('язы́к', 'S'), ('(', 'NONLEX'), ('МФА', 'S'), (':', 'NONLEX'), ('[', 'NONLEX'), ('ˈruskʲɪi̯', 'NONLEX'), ('jɪˈzɨk', 'NONLEX'), (']', 'NONLEX'), ('ⓘ', 'NONLEX'), (')', 'NONLEX'), ('[', 'NONLEX'), ('~', 'NONLEX'), ('3', 'NUM=ciph'), (']', 'NONLEX'), ('[', 'NONLEX'), ('⇨', 'NONLEX'), (']', 'NONLEX'), ('—', 'NONLEX'), ('язык', 'S'), ('восточнославянской', 'A=f'), ('группы', 'S'), ('славянской', 'A=f'), ('ветви', 'S'), ('индоевропейской', 'A=f'), ('языковой', 'A=f'), ('семьи', 'S'), (',', 'NONLEX'), ('национальный', 'A=m'), ('язык', 'S'), ('русского', 'A=m'), ('народа', 'S'), ('.', 'NONLEX')] + results_universal = [('Ру́сский', 'ADJ'), ('язы́к', 'NOUN'), ('(', 'PUNCT/SYM'), ('МФА', 'NOUN'), (':', 'PUNCT/SYM'), ('[', 'PUNCT/SYM'), ('ˈruskʲɪi̯', 'PUNCT/SYM'), ('jɪˈzɨk', 'PUNCT/SYM'), (']', 'PUNCT/SYM'), ('ⓘ', 'PUNCT/SYM'), (')', 'PUNCT/SYM'), ('[', 'PUNCT/SYM'), ('~', 'PUNCT/SYM'), ('3', 'NUM'), (']', 'PUNCT/SYM'), ('[', 'PUNCT/SYM'), ('⇨', 'PUNCT/SYM'), (']', 'PUNCT/SYM'), ('—', 'PUNCT/SYM'), ('язык', 'NOUN'), ('восточнославянской', 'ADJ'), ('группы', 'NOUN'), ('славянской', 'ADJ'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJ'), ('языковой', 'ADJ'), ('семьи', 'NOUN'), (',', 'PUNCT/SYM'), ('национальный', 'ADJ'), ('язык', 'NOUN'), ('русского', 'ADJ'), ('народа', 'NOUN'), ('.', 'PUNCT/SYM')] + case 'pymorphy3_morphological_analyzer': + results = [('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'), ('[', 'PNCT'), ('ˈruskʲɪi̯', 'UNKN'), ('jɪˈzɨk', 'UNKN'), (']', 'PNCT'), ('ⓘ', 'UNKN'), (')', 'PNCT'), ('[', 'PNCT'), ('~', 'UNKN'), ('3', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('⇨', 'UNKN'), (']', 'PNCT'), ('—', 'PNCT'), ('язык', 'NOUN'), ('восточнославянской', 'ADJF'), ('группы', 'NOUN'), ('славянской', 'ADJF'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJF'), ('языковой', 'ADJF'), ('семьи', 'NOUN'), (',', 'PNCT'), ('национальный', 'ADJF'), ('язык', 'NOUN'), ('русского', 'ADJF'), ('народа', 'NOUN'), ('.', 'PNCT')] + results_universal = [('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PUNCT'), ('МФА', 'SYM/X'), (':', 'PUNCT'), ('[', 'PUNCT'), ('ˈruskʲɪi̯', 'SYM/X'), ('jɪˈzɨk', 'SYM/X'), (']', 'PUNCT'), ('ⓘ', 'SYM/X'), (')', 'PUNCT'), ('[', 'PUNCT'), ('~', 'SYM/X'), ('3', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('⇨', 'SYM/X'), (']', 'PUNCT'), ('—', 'PUNCT'), ('язык', 'NOUN'), ('восточнославянской', 'ADJ'), ('группы', 'NOUN'), ('славянской', 'ADJ'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJ'), ('языковой', 'ADJ'), 
('семьи', 'NOUN'), (',', 'PUNCT'), ('национальный', 'ADJ'), ('язык', 'NOUN'), ('русского', 'ADJ'), ('народа', 'NOUN'), ('.', 'PUNCT')] + case _: + tests_lang_util_skipped = True + case 'tha': + match pos_tagger: + case 'pythainlp_perceptron_blackboard': + results = [('ภาษาไทย', 'NN'), ('หรือ', 'CC'), ('ภาษาไทย', 'NN'), ('กลาง', 'NN'), ('เป็น', 'VV'), ('ภาษา', 'NN'), ('ใน', 'PS'), ('กลุ่ม', 'NN'), ('ภาษา', 'NN'), ('ไท', 'NN'), ('ซึ่ง', 'CC'), ('เป็น', 'VV'), ('กลุ่มย่อย', 'NN'), ('ของ', 'PS'), ('ตระกูล', 'NN'), ('ภาษา', 'NN'), ('ข', 'NN'), ('ร้า', 'NN'), ('-', 'PU'), ('ไท', 'NN'), ('และ', 'CC'), ('เป็น', 'VV'), ('ภาษาราชการ', 'NN'), ('และ', 'CC'), ('ภาษาประจำชาติ', 'NN'), ('ของ', 'PS'), ('ประเทศ', 'NN'), ('ไทย', 'NN'), ('[', 'NN'), ('3', 'NU'), ('][', 'CL'), ('4', 'NU'), (']', 'CL')] + results_universal = [('ภาษาไทย', 'NOUN'), ('หรือ', 'CCONJ'), ('ภาษาไทย', 'NOUN'), ('กลาง', 'NOUN'), ('เป็น', 'VERB'), ('ภาษา', 'NOUN'), ('ใน', 'ADP'), ('กลุ่ม', 'NOUN'), ('ภาษา', 'NOUN'), ('ไท', 'NOUN'), ('ซึ่ง', 'CCONJ'), ('เป็น', 'VERB'), ('กลุ่มย่อย', 'NOUN'), ('ของ', 'ADP'), ('ตระกูล', 'NOUN'), ('ภาษา', 'NOUN'), ('ข', 'NOUN'), ('ร้า', 'NOUN'), ('-', 'PUNCT'), ('ไท', 'NOUN'), ('และ', 'CCONJ'), ('เป็น', 'VERB'), ('ภาษาราชการ', 'NOUN'), ('และ', 'CCONJ'), ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศ', 'NOUN'), ('ไทย', 'NOUN'), ('[', 'NOUN'), ('3', 'NUM'), ('][', 'NOUN'), ('4', 'NUM'), (']', 'NOUN')] + case 'pythainlp_perceptron_orchid': + results = [('ภาษาไทย', 'NPRP'), ('หรือ', 'JCRG'), ('ภาษาไทย', 'NPRP'), ('กลาง', 'VATT'), ('เป็น', 'VSTA'), ('ภาษา', 'NCMN'), ('ใน', 'RPRE'), ('กลุ่ม', 'NCMN'), ('ภาษา', 'NCMN'), ('ไท', 'NCMN'), ('ซึ่ง', 'PREL'), ('เป็น', 'VSTA'), ('กลุ่มย่อย', 'NCMN'), ('ของ', 'RPRE'), ('ตระกูล', 'NCMN'), ('ภาษา', 'NCMN'), ('ข', 'NCMN'), ('ร้า', 'NCMN'), ('-', 'PUNC'), ('ไท', 'NCMN'), ('และ', 'JCRG'), ('เป็น', 'VSTA'), ('ภาษาราชการ', 'NCMN'), ('และ', 'JCRG'), ('ภาษาประจำชาติ', 'NCMN'), ('ของ', 'RPRE'), ('ประเทศ', 'NCMN'), ('ไทย', 'NPRP'), ('[', 'NCMN'), ('3', 'NCNM'), ('][', 'PUNC'), ('4', 'NCNM'), (']', 'CMTR')] + results_universal = [('ภาษาไทย', 'PROPN'), ('หรือ', 'CCONJ'), ('ภาษาไทย', 'PROPN'), ('กลาง', 'ADJ'), ('เป็น', 'VERB'), ('ภาษา', 'NOUN'), ('ใน', 'ADP'), ('กลุ่ม', 'NOUN'), ('ภาษา', 'NOUN'), ('ไท', 'NOUN'), ('ซึ่ง', 'SCONJ'), ('เป็น', 'VERB'), ('กลุ่มย่อย', 'NOUN'), ('ของ', 'ADP'), ('ตระกูล', 'NOUN'), ('ภาษา', 'NOUN'), ('ข', 'NOUN'), ('ร้า', 'NOUN'), ('-', 'PUNCT'), ('ไท', 'NOUN'), ('และ', 'CCONJ'), ('เป็น', 'VERB'), ('ภาษาราชการ', 'NOUN'), ('และ', 'CCONJ'), ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศ', 'NOUN'), ('ไทย', 'PROPN'), ('[', 'NOUN'), ('3', 'NOUN/NUM'), ('][', 'PUNCT'), ('4', 'NOUN/NUM'), (']', 'NOUN')] + case 'pythainlp_perceptron_pud': + results = results_universal = [('ภาษาไทย', 'NOUN'), ('หรือ', 'CCONJ'), ('ภาษาไทย', 'NOUN'), ('กลาง', 'NOUN'), ('เป็น', 'AUX'), ('ภาษา', 'NOUN'), ('ใน', 'ADP'), ('กลุ่ม', 'NOUN'), ('ภาษา', 'NOUN'), ('ไท', 'PROPN'), ('ซึ่ง', 'DET'), ('เป็น', 'AUX'), ('กลุ่มย่อย', 'NOUN'), ('ของ', 'ADP'), ('ตระกูล', 'NOUN'), ('ภาษา', 'NOUN'), ('ข', 'NOUN'), ('ร้า', 'NOUN'), ('-', 'PUNCT'), ('ไท', 'PROPN'), ('และ', 'CCONJ'), ('เป็น', 'AUX'), ('ภาษาราชการ', 'NOUN'), ('และ', 'CCONJ'), ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศ', 'NOUN'), ('ไทย', 'PROPN'), ('[', 'NOUN'), ('3', 'NUM'), ('][', 'NOUN'), ('4', 'NUM'), (']', 'NOUN')] + case _: + tests_lang_util_skipped = True + case 'bod': + results = [('བོད་', 'PROPN'), ('ཀྱི་', 'PART'), ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'NO_POS'), ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'NO_POS'), ('ཉེ་འཁོར་', 'NOUN'), 
('གྱི་', 'PART'), ('ས་ཁུལ་', 'OTHER'), ('བལ་ཡུལ', 'PROPN'), ('།', 'PUNCT'), ('འབྲུག་', 'NOUN'), ('དང་', 'NO_POS'), ('འབྲས་ལྗོངས', 'OTHER'), ('།', 'PUNCT')] + results_universal = [('བོད་', 'PROPN'), ('ཀྱི་', 'PART'), ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'X'), ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'X'), ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'PART'), ('ས་ཁུལ་', 'X'), ('བལ་ཡུལ', 'PROPN'), ('།', 'PUNCT'), ('འབྲུག་', 'NOUN'), ('དང་', 'X'), ('འབྲས་ལྗོངས', 'X'), ('།', 'PUNCT')] + case 'ukr': + results = [('Украї́нська', 'ADJF'), ('мо́ва', 'ADJF'), ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'), ('[', 'PNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'UNKN'), ('ˈmɔwɑ̽', 'UNKN'), (']', 'PNCT'), (',', 'PNCT'), ('історичні', 'ADJF'), ('назви', 'NOUN'), ('—', 'PNCT'), ('ру́ська', 'ADJF'), ('[', 'PNCT'), ('10', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('11', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('12', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('*', 'PNCT'), ('1', 'NUMB'), (']', 'PNCT'), (')', 'PNCT'), ('—', 'PNCT'), ('національна', 'ADJF'), ('мова', 'NOUN'), ('українців', 'NOUN'), ('.', 'PNCT')] + results_universal = [('Украї́нська', 'ADJ'), ('мо́ва', 'ADJ'), ('(', 'PUNCT'), ('МФА', 'SYM/X'), (':', 'PUNCT'), ('[', 'PUNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'SYM/X'), ('ˈmɔwɑ̽', 'SYM/X'), (']', 'PUNCT'), (',', 'PUNCT'), ('історичні', 'ADJ'), ('назви', 'NOUN'), ('—', 'PUNCT'), ('ру́ська', 'ADJ'), ('[', 'PUNCT'), ('10', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('11', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('12', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('*', 'PUNCT'), ('1', 'NUM'), (']', 'PUNCT'), (')', 'PUNCT'), ('—', 'PUNCT'), ('національна', 'ADJ'), ('мова', 'NOUN'), ('українців', 'NOUN'), ('.', 'PUNCT')] + case 'vie': + results = [('Tiếng', 'N'), ('Việt', 'Np'), (',', 'CH'), ('cũng', 'R'), ('gọi là', 'X'), ('tiếng', 'N'), ('Việt Nam', 'Np'), ('[', 'V'), ('9', 'M'), (']', 'CH'), ('hay', 'C'), ('Việt ngữ', 'V'), ('là', 'V'), ('ngôn ngữ', 'N'), ('của', 'E'), ('người', 'Nc'), ('Việt', 'Np'), ('và', 'C'), ('là', 'V'), ('ngôn ngữ', 'N'), ('chính thức', 'A'), ('tại', 'E'), ('Việt Nam', 'Np'), ('.', 'CH')] + results_universal = [('Tiếng', 'NOUN'), ('Việt', 'PROPN'), (',', 'PUNCT'), ('cũng', 'X'), ('gọi là', 'X'), ('tiếng', 'NOUN'), ('Việt Nam', 'PROPN'), ('[', 'VERB'), ('9', 'NUM'), (']', 'PUNCT'), ('hay', 'CCONJ'), ('Việt ngữ', 'VERB'), ('là', 'VERB'), ('ngôn ngữ', 'NOUN'), ('của', 'ADP'), ('người', 'NOUN'), ('Việt', 'PROPN'), ('và', 'CCONJ'), ('là', 'VERB'), ('ngôn ngữ', 'NOUN'), ('chính thức', 'ADJ'), ('tại', 'ADP'), ('Việt Nam', 'PROPN'), ('.', 'PUNCT')] + case _: + raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + + if tests_lang_util_skipped: + raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(pos_tagger) + + wl_test_pos_tag_models(lang, pos_tagger, test_sentence, tokens, results, results_universal) + +def wl_test_pos_tag_models(lang, pos_tagger, test_sentence, tokens, results, results_universal): # Untokenized - tokens_tagged = wl_pos_tagging.wl_pos_tag( + tokens_untokenized = wl_pos_tagging.wl_pos_tag( main, - inputs = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), + inputs = test_sentence, lang = lang, pos_tagger = pos_tagger ) - tokens_tagged_universal = wl_pos_tagging.wl_pos_tag( + tokens_untokenized_universal = wl_pos_tagging.wl_pos_tag( main, - inputs = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), + inputs = test_sentence, lang = lang, pos_tagger = pos_tagger, tagset = 'universal' ) + # Remove separators between token and tags + tokens_tags_untokenized = [(str(token), token.tag[1:]) for token in tokens_untokenized] 
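Note on the separator handling above (an illustrative sketch, not part of the patch): wl_texts.to_tokens() appears to keep the leading separator on each stored tag (e.g. '_NN'), which is why the comparison lists are built with token.tag[1:]. Assuming that behaviour, which is what the tagged-input assertion at the end of this file checks, the relationship looks roughly like this:

    # Hypothetical illustration only; assumes 'main' and the language code are
    # set up exactly as in the tests above.
    tokens = wl_texts.to_tokens(['test'], lang = 'eng_us', tags = ['_NN'])
    token = tokens[0]
    # The separator is kept on the stored tag ...
    #     token.tag == '_NN'
    # ... so stripping the first character yields the bare tag used in `results`:
    #     (str(token), token.tag[1:]) == ('test', 'NN')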
+ tokens_tags_untokenized_universal = [(str(token), token.tag[1:]) for token in tokens_untokenized_universal] + + print(f'{lang} / {pos_tagger}:') + print(tokens_tags_untokenized) + print(f'{tokens_tags_untokenized_universal}\n') + # Tokenized - tokens = wl_word_tokenization.wl_word_tokenize_flat( - main, - text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), - lang = lang - ) - tokens_tagged_tokenized = wl_pos_tagging.wl_pos_tag( + tokens_tokenized = wl_pos_tagging.wl_pos_tag( main, inputs = tokens, lang = lang, pos_tagger = pos_tagger ) - tokens_tagged_universal_tokenized = wl_pos_tagging.wl_pos_tag( + tokens_tokenized_universal = wl_pos_tagging.wl_pos_tag( main, inputs = tokens, lang = lang, @@ -79,91 +154,50 @@ def test_pos_tag(lang, pos_tagger): tagset = 'universal' ) - print(f'{lang} / {pos_tagger}:') - print(tokens_tagged) - print(f'{tokens_tagged_universal}\n') + tokens_tags_tokenized = [(str(token), token.tag) for token in tokens_tokenized] + tokens_tags_tokenized_universal = [(str(token), token.tag) for token in tokens_tokenized_universal] + + assert tokens_tags_untokenized == results + assert tokens_tags_untokenized_universal == results_universal # Check for empty tags - assert tokens_tagged - assert tokens_tagged_universal - assert tokens_tagged_tokenized - assert tokens_tagged_universal_tokenized - assert all((tag for token, tag in tokens_tagged)) - assert all((tag for token, tag in tokens_tagged_universal)) - assert all((tag for token, tag in tokens_tagged_tokenized)) - assert all((tag for token, tag in tokens_tagged_universal_tokenized)) + assert tokens_tags_untokenized + assert tokens_tags_untokenized_universal + assert tokens_tags_tokenized + assert tokens_tags_tokenized_universal + assert all((tag for token, tag in tokens_tags_untokenized)) + assert all((tag for token, tag in tokens_tags_untokenized_universal)) + assert all((tag for token, tag in tokens_tags_tokenized)) + assert all((tag for token, tag in tokens_tags_tokenized_universal)) + # Universal tags should not all be "X" - assert any((tag for token, tag in tokens_tagged_universal if tag != 'X')) - assert any((tag for token, tag in tokens_tagged_universal_tokenized if tag != 'X')) + assert any((tag for token, tag in tokens_tags_untokenized_universal if tag != 'X')) + assert any((tag for token, tag in tokens_tags_tokenized_universal if tag != 'X')) # Tokenization should not be modified - assert len(tokens) == len(tokens_tagged_tokenized) == len(tokens_tagged_universal_tokenized) + assert len(tokens) == len(tokens_tags_tokenized) == len(tokens_tags_tokenized_universal) - # Long texts - tokens_tagged_tokenized_long = wl_pos_tagging.wl_pos_tag( + # Long + tokens_long = wl_pos_tagging.wl_pos_tag( main, - inputs = [str(i) for i in range(101) for j in range(10)], + inputs = wl_texts.to_tokens(wl_test_lang_examples.TOKENS_LONG, lang = lang), lang = lang, pos_tagger = pos_tagger ) - assert [token[0] for token in tokens_tagged_tokenized_long] == [str(i) for i in range(101) for j in range(10)] - - tests_lang_util_skipped = False + assert [str(token) for token in tokens_long] == wl_test_lang_examples.TOKENS_LONG - if lang.startswith('eng_'): - assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('West', 'NNP'), ('Germanic', 'NNP'), ('language', 'NN'), ('in', 'IN'), ('the', 'DT'), ('Indo-European', 'JJ'), ('language', 'NN'), ('family', 'NN'), ('.', '.')] - assert tokens_tagged_universal == [('English', 'PROPN'), ('is', 'VERB'), ('a', 'DET'), ('West', 'PROPN'), ('Germanic', 'PROPN'), 
('language', 'NOUN'), ('in', 'ADP/SCONJ'), ('the', 'DET'), ('Indo-European', 'ADJ'), ('language', 'NOUN'), ('family', 'NOUN'), ('.', 'PUNCT')] - elif lang == 'jpn': - assert tokens_tagged == [('日本語', '名詞-普通名詞-一般'), ('(', '補助記号-括弧開'), ('にほん', '名詞-固有名詞-地名-国'), ('ご', '接尾辞-名詞的-一般'), ('、', '補助記号-読点'), ('にっぽん', '名詞-固有名詞-地名-国'), ('ご', '接尾辞-名詞的-一般'), ('[', '補助記号-括弧開'), ('注釈', '名詞-普通名詞-サ変可能'), ('2', '名詞-数詞'), (']', '補助記号-括弧閉'), (')', '補助記号-括弧閉'), ('は', '助詞-係助詞'), ('、', '補助記号-読点'), ('日本', '名詞-固有名詞-地名-国'), ('国', '接尾辞-名詞的-一般'), ('内', '接尾辞-名詞的-一般'), ('や', '助詞-副助詞'), ('、', '補助記号-読点'), ('かつて', '副詞'), ('の', '助詞-格助詞'), ('日本', '名詞-固有名詞-地名-国'), ('領', '接尾辞-名詞的-一般'), ('だっ', '助動詞'), ('た', '助動詞'), ('国', '名詞-普通名詞-一般'), ('、', '補助記号-読点'), ('そして', '接続詞'), ('国外', '名詞-普通名詞-一般'), ('移民', '名詞-普通名詞-サ変可能'), ('や', '助詞-副助詞'), ('移住者', '名詞-普通名詞-一般'), ('を', '助詞-格助詞'), ('含む', '動詞-一般'), ('日本人', '名詞-普通名詞-一般'), ('同士', '接尾辞-名詞的-一般'), ('の', '助詞-格助詞'), ('間', '名詞-普通名詞-副詞可能'), ('で', '助詞-格助詞'), ('使用', '名詞-普通名詞-サ変可能'), ('さ', '動詞-非自立可能'), ('れ', '助動詞'), ('て', '助詞-接続助詞'), ('いる', '動詞-非自立可能'), ('言語', '名詞-普通名詞-一般'), ('。', '補助記号-句点')] - assert tokens_tagged_universal == [('日本語', 'NOUN'), ('(', 'PUNCT'), ('にほん', 'PROPN'), ('ご', 'NOUN'), ('、', 'PUNCT'), ('にっぽん', 'PROPN'), ('ご', 'NOUN'), ('[', 'PUNCT'), ('注釈', 'NOUN'), ('2', 'NUM'), (']', 'PUNCT'), (')', 'PUNCT'), ('は', 'ADP'), ('、', 'PUNCT'), ('日本', 'PROPN'), ('国', 'NOUN'), ('内', 'NOUN'), ('や', 'ADP'), ('、', 'PUNCT'), ('かつて', 'ADV'), ('の', 'ADP'), ('日本', 'PROPN'), ('領', 'NOUN'), ('だっ', 'AUX'), ('た', 'AUX'), ('国', 'NOUN'), ('、', 'PUNCT'), ('そして', 'CCONJ'), ('国外', 'NOUN'), ('移民', 'NOUN'), ('や', 'ADP'), ('移住者', 'NOUN'), ('を', 'ADP'), ('含む', 'VERB'), ('日本人', 'NOUN'), ('同士', 'NOUN'), ('の', 'ADP'), ('間', 'NOUN'), ('で', 'ADP'), ('使用', 'NOUN'), ('さ', 'AUX'), ('れ', 'AUX'), ('て', 'SCONJ'), ('いる', 'AUX'), ('言語', 'NOUN'), ('。', 'PUNCT')] - elif lang == 'khm': - assert tokens_tagged == [('ភាសា', 'n'), ('ខ្មែរ', 'n'), ('គឺជា', 'v'), ('ភាសា', 'n'), ('កំណើត', 'n'), ('របស់', 'o'), ('ជនជាតិ', 'n'), ('ខ្មែរ', 'n'), ('និង', 'o'), ('ជា', 'v'), ('ភាសា', 'n'), ('ផ្លូវការ', 'n'), ('របស់', 'o'), ('ប្រទេស', 'n'), ('កម្ពុជា', 'n'), ('។', '.')] - assert tokens_tagged_universal == [('ភាសា', 'NOUN'), ('ខ្មែរ', 'NOUN'), ('គឺជា', 'VERB'), ('ភាសា', 'NOUN'), ('កំណើត', 'NOUN'), ('របស់', 'PART'), ('ជនជាតិ', 'NOUN'), ('ខ្មែរ', 'NOUN'), ('និង', 'PART'), ('ជា', 'VERB'), ('ភាសា', 'NOUN'), ('ផ្លូវការ', 'NOUN'), ('របស់', 'PART'), ('ប្រទេស', 'NOUN'), ('កម្ពុជា', 'NOUN'), ('។', 'PUNCT')] - elif lang == 'kor': - assert tokens_tagged == [('세계', 'NNG'), ('여러', 'MM'), ('지역', 'NNG'), ('에', 'JKB'), ('한', 'MM'), ('민족', 'NNG'), ('인구', 'NNG'), ('가', 'JKS'), ('거주', 'NNG'), ('하', 'XSA'), ('게', 'EC'), ('되', 'VV'), ('면서', 'EC'), ('전', 'MM'), ('세계', 'NNG'), ('각지', 'NNG'), ('에서', 'JKB'), ('한국어', 'NNG'), ('가', 'JKS'), ('사용', 'NNG'), ('되', 'VV'), ('고', 'EC'), ('있', 'VX'), ('다', 'EF'), ('.', 'SF')] - assert tokens_tagged_universal == [('세계', 'NOUN'), ('여러', 'DET'), ('지역', 'NOUN'), ('에', 'ADP'), ('한', 'DET'), ('민족', 'NOUN'), ('인구', 'NOUN'), ('가', 'ADP'), ('거주', 'NOUN'), ('하', 'X'), ('게', 'X'), ('되', 'VERB'), ('면서', 'X'), ('전', 'DET'), ('세계', 'NOUN'), ('각지', 'NOUN'), ('에서', 'ADP'), ('한국어', 'NOUN'), ('가', 'ADP'), ('사용', 'NOUN'), ('되', 'VERB'), ('고', 'X'), ('있', 'AUX'), ('다', 'X'), ('.', 'PUNCT')] - elif lang == 'lao': - if pos_tagger == 'laonlp_seqlabeling': - assert tokens_tagged == [('ພາສາລາວ', 'N'), ('(', 'V'), ('Lao', 'PRN'), (':', 'PUNCT'), ('ລາວ', 'PRS'), (',', 'PUNCT'), ('[', 'PUNCT'), ('l', 'PRN'), ('áː', 'PRN'), ('w', 'PRN'), (']', 'PUNCT'), ('ຫຼື', 
'COJ'), ('ພາສາລາວ', 'PRN'), (',', 'PUNCT'), ('[', 'N'), ('p', 'PRN'), ('ʰáː', 'PRN'), ('s', 'PRN'), ('ǎː', 'PRN'), ('l', 'PRN'), ('áː', 'PRN'), ('w', 'PRN'), ('])', 'PRN'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ຕະກູນ', 'PRN'), ('ໄທ', 'PRN'), ('-', 'PUNCT'), ('ກະໄດ', 'N'), ('ຂອງ', 'PRE'), ('ຄົນ', 'N'), ('ລາວ', 'PRS'), ('ໂດຍ', 'PRE'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ໃນປະເທດລາວ', 'N'), ('ເຊິ່ງ', 'REL'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ລັດຖະການ', 'N'), ('ຂອງ', 'PRE'), ('ສາທາລະນະລັດ', 'N'), ('ປະຊາທິປະໄຕ', 'N'), ('ປະຊາຊົນ', 'N'), ('ລາວ', 'PRS'), ('ຂອງ', 'PRE'), ('ປະຊາກອນ', 'N'), ('ປະມານ', 'IBQ'), ('7', 'V'), ('ລ້ານ', 'N'), ('ຄົນ', 'N'), ('ແລະ', 'COJ'), ('ໃນ', 'PRE'), ('ພື້ນທີ່', 'N'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'N'), ('ເໜືອ', 'PRN'), ('ຂອງ', 'PRE'), ('ປະເທດໄທ', 'PRN'), ('ທີ່ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ປະມານ', 'IBQ'), ('23', 'V'), ('ລ້ານ', 'N'), ('ຄົນ', 'N'), ('ທາງ', 'PRE'), ('ລັດຖະບານ', 'N'), ('ປະເທດໄທ', 'PRN'), ('ມີການສະໜັບສະໜຸນ', 'V'), ('ໃຫ້', 'PVA'), ('ເອີ້ນ', 'V'), ('ພາສາລາວ', 'N'), ('ຖິ່ນ', 'N'), ('ໄທ', 'PRN'), ('ວ່າ', 'COJ'), ('ພາສາລາວ', 'PRN'), ('ຖິ່ນ', 'PRN'), ('ອີສານ', 'N'), ('ນອກຈາກ', 'PRE'), ('ນີ້', 'DMN'), (',', 'PUNCT'), ('ຢູ່', 'PRE'), ('ທາງ', 'N'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'N'), ('ເໜືອ', 'N'), ('ຂອງ', 'PRE'), ('ປະເທດກຳປູເຈຍ', 'N'), ('ກໍ', 'IAC'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ພາສາລາວ', 'N'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] - assert tokens_tagged_universal == [('ພາສາລາວ', 'NOUN'), ('(', 'VERB'), ('Lao', 'PROPN'), (':', 'PUNCT'), ('ລາວ', 'PRON'), (',', 'PUNCT'), ('[', 'PUNCT'), ('l', 'PROPN'), ('áː', 'PROPN'), ('w', 'PROPN'), (']', 'PUNCT'), ('ຫຼື', 'CONJ'), ('ພາສາລາວ', 'PROPN'), (',', 'PUNCT'), ('[', 'NOUN'), ('p', 'PROPN'), ('ʰáː', 'PROPN'), ('s', 'PROPN'), ('ǎː', 'PROPN'), ('l', 'PROPN'), ('áː', 'PROPN'), ('w', 'PROPN'), ('])', 'PROPN'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ຕະກູນ', 'PROPN'), ('ໄທ', 'PROPN'), ('-', 'PUNCT'), ('ກະໄດ', 'NOUN'), ('ຂອງ', 'ADP'), ('ຄົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ໂດຍ', 'ADP'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ໃນປະເທດລາວ', 'NOUN'), ('ເຊິ່ງ', 'PRON'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ລັດຖະການ', 'NOUN'), ('ຂອງ', 'ADP'), ('ສາທາລະນະລັດ', 'NOUN'), ('ປະຊາທິປະໄຕ', 'NOUN'), ('ປະຊາຊົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ຂອງ', 'ADP'), ('ປະຊາກອນ', 'NOUN'), ('ປະມານ', 'DET'), ('7', 'VERB'), ('ລ້ານ', 'NOUN'), ('ຄົນ', 'NOUN'), ('ແລະ', 'CONJ'), ('ໃນ', 'ADP'), ('ພື້ນທີ່', 'NOUN'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'NOUN'), ('ເໜືອ', 'PROPN'), ('ຂອງ', 'ADP'), ('ປະເທດໄທ', 'PROPN'), ('ທີ່ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ປະມານ', 'DET'), ('23', 'VERB'), ('ລ້ານ', 'NOUN'), ('ຄົນ', 'NOUN'), ('ທາງ', 'ADP'), ('ລັດຖະບານ', 'NOUN'), ('ປະເທດໄທ', 'PROPN'), ('ມີການສະໜັບສະໜຸນ', 'VERB'), ('ໃຫ້', 'AUX'), ('ເອີ້ນ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຖິ່ນ', 'NOUN'), ('ໄທ', 'PROPN'), ('ວ່າ', 'CONJ'), ('ພາສາລາວ', 'PROPN'), ('ຖິ່ນ', 'PROPN'), ('ອີສານ', 'NOUN'), ('ນອກຈາກ', 'ADP'), ('ນີ້', 'PRON'), (',', 'PUNCT'), ('ຢູ່', 'ADP'), ('ທາງ', 'NOUN'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'NOUN'), ('ເໜືອ', 'NOUN'), ('ຂອງ', 'ADP'), ('ປະເທດກຳປູເຈຍ', 'NOUN'), ('ກໍ', 'DET'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] - elif pos_tagger == 'laonlp_yunshan_cup_2020': - assert tokens_tagged == [('ພາສາລາວ', 'PRN'), ('(', 'PUNCT'), ('Lao', 'PRN'), (':', 'PUNCT'), ('ລາວ', 'PRS'), (',', 'PUNCT'), ('[', 'COJ'), ('l', 'N'), ('áː', 'N'), ('w', 'N'), (']', 'PUNCT'), ('ຫຼື', 'COJ'), ('ພາສາລາວ', 'PRN'), (',', 'PUNCT'), ('[', 'PUNCT'), ('p', 'PRN'), ('ʰáː', 'PRN'), ('s', 
'PRN'), ('ǎː', 'PRN'), ('l', 'PRN'), ('áː', 'PRN'), ('w', 'PRN'), ('])', 'PRN'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ຕະກູນ', 'PRN'), ('ໄທ', 'PRN'), ('-', 'PUNCT'), ('ກະໄດ', 'N'), ('ຂອງ', 'PRE'), ('ຄົນ', 'N'), ('ລາວ', 'PRS'), ('ໂດຍ', 'PRE'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ໃນປະເທດລາວ', 'N'), ('ເຊິ່ງ', 'REL'), ('ເປັນ', 'V'), ('ພາສາ', 'N'), ('ລັດຖະການ', 'N'), ('ຂອງ', 'PRE'), ('ສາທາລະນະລັດ', 'N'), ('ປະຊາທິປະໄຕ', 'N'), ('ປະຊາຊົນ', 'N'), ('ລາວ', 'PRS'), ('ຂອງ', 'PRE'), ('ປະຊາກອນ', 'N'), ('ປະມານ', 'IBQ'), ('7', 'V'), ('ລ້ານ', 'V'), ('ຄົນ', 'N'), ('ແລະ', 'COJ'), ('ໃນ', 'PRE'), ('ພື້ນທີ່', 'N'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'V'), ('ເໜືອ', 'PRN'), ('ຂອງ', 'PRE'), ('ປະເທດໄທ', 'PRN'), ('ທີ່ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ປະມານ', 'IBQ'), ('23', 'V'), ('ລ້ານ', 'CLF'), ('ຄົນ', 'N'), ('ທາງ', 'PRE'), ('ລັດຖະບານ', 'N'), ('ປະເທດໄທ', 'PRN'), ('ມີການສະໜັບສະໜຸນ', 'V'), ('ໃຫ້', 'PVA'), ('ເອີ້ນ', 'V'), ('ພາສາລາວ', 'N'), ('ຖິ່ນ', 'N'), ('ໄທ', 'PRN'), ('ວ່າ', 'COJ'), ('ພາສາລາວ', 'PRN'), ('ຖິ່ນ', 'PRN'), ('ອີສານ', 'N'), ('ນອກຈາກ', 'PRE'), ('ນີ້', 'DMN'), (',', 'PUNCT'), ('ຢູ່', 'ADV'), ('ທາງ', 'PRE'), ('ພາກ', 'N'), ('ຕາເວັນອອກສຽງ', 'N'), ('ເໜືອ', 'N'), ('ຂອງ', 'PRE'), ('ປະເທດກຳປູເຈຍ', 'N'), ('ກໍ', 'IAC'), ('ມີ', 'V'), ('ຄົນ', 'N'), ('ເວົ້າ', 'V'), ('ພາສາລາວ', 'N'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] - assert tokens_tagged_universal == [('ພາສາລາວ', 'PROPN'), ('(', 'PUNCT'), ('Lao', 'PROPN'), (':', 'PUNCT'), ('ລາວ', 'PRON'), (',', 'PUNCT'), ('[', 'CONJ'), ('l', 'NOUN'), ('áː', 'NOUN'), ('w', 'NOUN'), (']', 'PUNCT'), ('ຫຼື', 'CONJ'), ('ພາສາລາວ', 'PROPN'), (',', 'PUNCT'), ('[', 'PUNCT'), ('p', 'PROPN'), ('ʰáː', 'PROPN'), ('s', 'PROPN'), ('ǎː', 'PROPN'), ('l', 'PROPN'), ('áː', 'PROPN'), ('w', 'PROPN'), ('])', 'PROPN'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ຕະກູນ', 'PROPN'), ('ໄທ', 'PROPN'), ('-', 'PUNCT'), ('ກະໄດ', 'NOUN'), ('ຂອງ', 'ADP'), ('ຄົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ໂດຍ', 'ADP'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ໃນປະເທດລາວ', 'NOUN'), ('ເຊິ່ງ', 'PRON'), ('ເປັນ', 'VERB'), ('ພາສາ', 'NOUN'), ('ລັດຖະການ', 'NOUN'), ('ຂອງ', 'ADP'), ('ສາທາລະນະລັດ', 'NOUN'), ('ປະຊາທິປະໄຕ', 'NOUN'), ('ປະຊາຊົນ', 'NOUN'), ('ລາວ', 'PRON'), ('ຂອງ', 'ADP'), ('ປະຊາກອນ', 'NOUN'), ('ປະມານ', 'DET'), ('7', 'VERB'), ('ລ້ານ', 'VERB'), ('ຄົນ', 'NOUN'), ('ແລະ', 'CONJ'), ('ໃນ', 'ADP'), ('ພື້ນທີ່', 'NOUN'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'VERB'), ('ເໜືອ', 'PROPN'), ('ຂອງ', 'ADP'), ('ປະເທດໄທ', 'PROPN'), ('ທີ່ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ປະມານ', 'DET'), ('23', 'VERB'), ('ລ້ານ', 'PART'), ('ຄົນ', 'NOUN'), ('ທາງ', 'ADP'), ('ລັດຖະບານ', 'NOUN'), ('ປະເທດໄທ', 'PROPN'), ('ມີການສະໜັບສະໜຸນ', 'VERB'), ('ໃຫ້', 'AUX'), ('ເອີ້ນ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຖິ່ນ', 'NOUN'), ('ໄທ', 'PROPN'), ('ວ່າ', 'CONJ'), ('ພາສາລາວ', 'PROPN'), ('ຖິ່ນ', 'PROPN'), ('ອີສານ', 'NOUN'), ('ນອກຈາກ', 'ADP'), ('ນີ້', 'PRON'), (',', 'PUNCT'), ('ຢູ່', 'ADV'), ('ທາງ', 'ADP'), ('ພາກ', 'NOUN'), ('ຕາເວັນອອກສຽງ', 'NOUN'), ('ເໜືອ', 'NOUN'), ('ຂອງ', 'ADP'), ('ປະເທດກຳປູເຈຍ', 'NOUN'), ('ກໍ', 'DET'), ('ມີ', 'VERB'), ('ຄົນ', 'NOUN'), ('ເວົ້າ', 'VERB'), ('ພາສາລາວ', 'NOUN'), ('ຄືກັນ', 'ADJ'), ('.', 'PUNCT')] - elif lang == 'rus': - if pos_tagger == 'nltk_perceptron_rus': - assert tokens_tagged == [('Ру́сский', 'A=m'), ('язы́к', 'S'), ('(', 'NONLEX'), ('МФА', 'S'), (':', 'NONLEX'), ('[', 'NONLEX'), ('ˈruskʲɪi̯', 'NONLEX'), ('jɪˈzɨk', 'NONLEX'), (']', 'NONLEX'), ('ⓘ', 'NONLEX'), (')', 'NONLEX'), ('[', 'NONLEX'), ('~', 'NONLEX'), ('3', 'NUM=ciph'), (']', 'NONLEX'), ('[', 'NONLEX'), ('⇨', 'NONLEX'), (']', 'NONLEX'), ('—', 'NONLEX'), ('язык', 
'S'), ('восточнославянской', 'A=f'), ('группы', 'S'), ('славянской', 'A=f'), ('ветви', 'S'), ('индоевропейской', 'A=f'), ('языковой', 'A=f'), ('семьи', 'S'), (',', 'NONLEX'), ('национальный', 'A=m'), ('язык', 'S'), ('русского', 'A=m'), ('народа', 'S'), ('.', 'NONLEX')] - assert tokens_tagged_universal == [('Ру́сский', 'ADJ'), ('язы́к', 'NOUN'), ('(', 'PUNCT/SYM'), ('МФА', 'NOUN'), (':', 'PUNCT/SYM'), ('[', 'PUNCT/SYM'), ('ˈruskʲɪi̯', 'PUNCT/SYM'), ('jɪˈzɨk', 'PUNCT/SYM'), (']', 'PUNCT/SYM'), ('ⓘ', 'PUNCT/SYM'), (')', 'PUNCT/SYM'), ('[', 'PUNCT/SYM'), ('~', 'PUNCT/SYM'), ('3', 'NUM'), (']', 'PUNCT/SYM'), ('[', 'PUNCT/SYM'), ('⇨', 'PUNCT/SYM'), (']', 'PUNCT/SYM'), ('—', 'PUNCT/SYM'), ('язык', 'NOUN'), ('восточнославянской', 'ADJ'), ('группы', 'NOUN'), ('славянской', 'ADJ'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJ'), ('языковой', 'ADJ'), ('семьи', 'NOUN'), (',', 'PUNCT/SYM'), ('национальный', 'ADJ'), ('язык', 'NOUN'), ('русского', 'ADJ'), ('народа', 'NOUN'), ('.', 'PUNCT/SYM')] - elif pos_tagger == 'pymorphy3_morphological_analyzer': - assert tokens_tagged == [('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'), ('[', 'PNCT'), ('ˈruskʲɪi̯', 'UNKN'), ('jɪˈzɨk', 'UNKN'), (']', 'PNCT'), ('ⓘ', 'UNKN'), (')', 'PNCT'), ('[', 'PNCT'), ('~', 'UNKN'), ('3', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('⇨', 'UNKN'), (']', 'PNCT'), ('—', 'PNCT'), ('язык', 'NOUN'), ('восточнославянской', 'ADJF'), ('группы', 'NOUN'), ('славянской', 'ADJF'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJF'), ('языковой', 'ADJF'), ('семьи', 'NOUN'), (',', 'PNCT'), ('национальный', 'ADJF'), ('язык', 'NOUN'), ('русского', 'ADJF'), ('народа', 'NOUN'), ('.', 'PNCT')] - assert tokens_tagged_universal == [('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PUNCT'), ('МФА', 'SYM/X'), (':', 'PUNCT'), ('[', 'PUNCT'), ('ˈruskʲɪi̯', 'SYM/X'), ('jɪˈzɨk', 'SYM/X'), (']', 'PUNCT'), ('ⓘ', 'SYM/X'), (')', 'PUNCT'), ('[', 'PUNCT'), ('~', 'SYM/X'), ('3', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('⇨', 'SYM/X'), (']', 'PUNCT'), ('—', 'PUNCT'), ('язык', 'NOUN'), ('восточнославянской', 'ADJ'), ('группы', 'NOUN'), ('славянской', 'ADJ'), ('ветви', 'NOUN'), ('индоевропейской', 'ADJ'), ('языковой', 'ADJ'), ('семьи', 'NOUN'), (',', 'PUNCT'), ('национальный', 'ADJ'), ('язык', 'NOUN'), ('русского', 'ADJ'), ('народа', 'NOUN'), ('.', 'PUNCT')] - else: - tests_lang_util_skipped = True - elif lang == 'tha': - if pos_tagger == 'pythainlp_perceptron_blackboard': - assert tokens_tagged == [('ภาษาไทย', 'NN'), ('หรือ', 'CC'), ('ภาษาไทย', 'NN'), ('กลาง', 'NN'), ('เป็น', 'VV'), ('ภาษา', 'NN'), ('ใน', 'PS'), ('กลุ่ม', 'NN'), ('ภาษา', 'NN'), ('ไท', 'NN'), ('ซึ่ง', 'CC'), ('เป็น', 'VV'), ('กลุ่มย่อย', 'NN'), ('ของ', 'PS'), ('ตระกูล', 'NN'), ('ภาษา', 'NN'), ('ข', 'NN'), ('ร้า', 'NN'), ('-', 'PU'), ('ไท', 'NN'), ('และ', 'CC'), ('เป็น', 'VV'), ('ภาษาราชการ', 'NN'), ('และ', 'CC'), ('ภาษาประจำชาติ', 'NN'), ('ของ', 'PS'), ('ประเทศ', 'NN'), ('ไทย', 'NN'), ('[', 'NN'), ('3', 'NU'), ('][', 'CL'), ('4', 'NU'), (']', 'CL')] - assert tokens_tagged_universal == [('ภาษาไทย', 'NOUN'), ('หรือ', 'CCONJ'), ('ภาษาไทย', 'NOUN'), ('กลาง', 'NOUN'), ('เป็น', 'VERB'), ('ภาษา', 'NOUN'), ('ใน', 'ADP'), ('กลุ่ม', 'NOUN'), ('ภาษา', 'NOUN'), ('ไท', 'NOUN'), ('ซึ่ง', 'CCONJ'), ('เป็น', 'VERB'), ('กลุ่มย่อย', 'NOUN'), ('ของ', 'ADP'), ('ตระกูล', 'NOUN'), ('ภาษา', 'NOUN'), ('ข', 'NOUN'), ('ร้า', 'NOUN'), ('-', 'PUNCT'), ('ไท', 'NOUN'), ('และ', 'CCONJ'), ('เป็น', 'VERB'), ('ภาษาราชการ', 'NOUN'), ('และ', 'CCONJ'), ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศ', 
'NOUN'), ('ไทย', 'NOUN'), ('[', 'NOUN'), ('3', 'NUM'), ('][', 'NOUN'), ('4', 'NUM'), (']', 'NOUN')] - elif pos_tagger == 'pythainlp_perceptron_orchid': - assert tokens_tagged == [('ภาษาไทย', 'NPRP'), ('หรือ', 'JCRG'), ('ภาษาไทย', 'NPRP'), ('กลาง', 'VATT'), ('เป็น', 'VSTA'), ('ภาษา', 'NCMN'), ('ใน', 'RPRE'), ('กลุ่ม', 'NCMN'), ('ภาษา', 'NCMN'), ('ไท', 'NCMN'), ('ซึ่ง', 'PREL'), ('เป็น', 'VSTA'), ('กลุ่มย่อย', 'NCMN'), ('ของ', 'RPRE'), ('ตระกูล', 'NCMN'), ('ภาษา', 'NCMN'), ('ข', 'NCMN'), ('ร้า', 'NCMN'), ('-', 'PUNC'), ('ไท', 'NCMN'), ('และ', 'JCRG'), ('เป็น', 'VSTA'), ('ภาษาราชการ', 'NCMN'), ('และ', 'JCRG'), ('ภาษาประจำชาติ', 'NCMN'), ('ของ', 'RPRE'), ('ประเทศ', 'NCMN'), ('ไทย', 'NPRP'), ('[', 'NCMN'), ('3', 'NCNM'), ('][', 'PUNC'), ('4', 'NCNM'), (']', 'CMTR')] - assert tokens_tagged_universal == [('ภาษาไทย', 'PROPN'), ('หรือ', 'CCONJ'), ('ภาษาไทย', 'PROPN'), ('กลาง', 'ADJ'), ('เป็น', 'VERB'), ('ภาษา', 'NOUN'), ('ใน', 'ADP'), ('กลุ่ม', 'NOUN'), ('ภาษา', 'NOUN'), ('ไท', 'NOUN'), ('ซึ่ง', 'SCONJ'), ('เป็น', 'VERB'), ('กลุ่มย่อย', 'NOUN'), ('ของ', 'ADP'), ('ตระกูล', 'NOUN'), ('ภาษา', 'NOUN'), ('ข', 'NOUN'), ('ร้า', 'NOUN'), ('-', 'PUNCT'), ('ไท', 'NOUN'), ('และ', 'CCONJ'), ('เป็น', 'VERB'), ('ภาษาราชการ', 'NOUN'), ('และ', 'CCONJ'), ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศ', 'NOUN'), ('ไทย', 'PROPN'), ('[', 'NOUN'), ('3', 'NOUN/NUM'), ('][', 'PUNCT'), ('4', 'NOUN/NUM'), (']', 'NOUN')] - elif pos_tagger == 'pythainlp_perceptron_pud': - assert tokens_tagged == tokens_tagged_universal == [('ภาษาไทย', 'NOUN'), ('หรือ', 'CCONJ'), ('ภาษาไทย', 'NOUN'), ('กลาง', 'NOUN'), ('เป็น', 'AUX'), ('ภาษา', 'NOUN'), ('ใน', 'ADP'), ('กลุ่ม', 'NOUN'), ('ภาษา', 'NOUN'), ('ไท', 'PROPN'), ('ซึ่ง', 'DET'), ('เป็น', 'AUX'), ('กลุ่มย่อย', 'NOUN'), ('ของ', 'ADP'), ('ตระกูล', 'NOUN'), ('ภาษา', 'NOUN'), ('ข', 'NOUN'), ('ร้า', 'NOUN'), ('-', 'PUNCT'), ('ไท', 'PROPN'), ('และ', 'CCONJ'), ('เป็น', 'AUX'), ('ภาษาราชการ', 'NOUN'), ('และ', 'CCONJ'), ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศ', 'NOUN'), ('ไทย', 'PROPN'), ('[', 'NOUN'), ('3', 'NUM'), ('][', 'NOUN'), ('4', 'NUM'), (']', 'NOUN')] - else: - tests_lang_util_skipped = True - elif lang == 'bod': - assert tokens_tagged == [('བོད་', 'PROPN'), ('ཀྱི་', 'PART'), ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'NO_POS'), ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'NO_POS'), ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'PART'), ('ས་ཁུལ་', 'OTHER'), ('བལ་ཡུལ', 'PROPN'), ('།', 'PUNCT'), ('འབྲུག་', 'NOUN'), ('དང་', 'NO_POS'), ('འབྲས་ལྗོངས', 'OTHER'), ('།', 'PUNCT')] - assert tokens_tagged_universal == [('བོད་', 'PROPN'), ('ཀྱི་', 'PART'), ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'X'), ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'X'), ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'PART'), ('ས་ཁུལ་', 'X'), ('བལ་ཡུལ', 'PROPN'), ('།', 'PUNCT'), ('འབྲུག་', 'NOUN'), ('དང་', 'X'), ('འབྲས་ལྗོངས', 'X'), ('།', 'PUNCT')] - elif lang == 'ukr': - assert tokens_tagged == [('Украї́нська', 'ADJF'), ('мо́ва', 'ADJF'), ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'), ('[', 'PNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'UNKN'), ('ˈmɔwɑ̽', 'UNKN'), (']', 'PNCT'), (',', 'PNCT'), ('історичні', 'ADJF'), ('назви', 'NOUN'), ('—', 'PNCT'), ('ру́ська', 'ADJF'), ('[', 'PNCT'), ('10', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('11', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('12', 'NUMB'), (']', 'PNCT'), ('[', 'PNCT'), ('*', 'PNCT'), ('1', 'NUMB'), (']', 'PNCT'), (')', 'PNCT'), ('—', 'PNCT'), ('національна', 'ADJF'), ('мова', 'NOUN'), ('українців', 'NOUN'), ('.', 'PNCT')] - assert tokens_tagged_universal == [('Украї́нська', 'ADJ'), ('мо́ва', 'ADJ'), ('(', 'PUNCT'), ('МФА', 'SYM/X'), (':', 
'PUNCT'), ('[', 'PUNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'SYM/X'), ('ˈmɔwɑ̽', 'SYM/X'), (']', 'PUNCT'), (',', 'PUNCT'), ('історичні', 'ADJ'), ('назви', 'NOUN'), ('—', 'PUNCT'), ('ру́ська', 'ADJ'), ('[', 'PUNCT'), ('10', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('11', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('12', 'NUM'), (']', 'PUNCT'), ('[', 'PUNCT'), ('*', 'PUNCT'), ('1', 'NUM'), (']', 'PUNCT'), (')', 'PUNCT'), ('—', 'PUNCT'), ('національна', 'ADJ'), ('мова', 'NOUN'), ('українців', 'NOUN'), ('.', 'PUNCT')] - elif lang == 'vie': - assert tokens_tagged == [('Tiếng', 'N'), ('Việt', 'Np'), (',', 'CH'), ('cũng', 'R'), ('gọi là', 'X'), ('tiếng', 'N'), ('Việt Nam', 'Np'), ('[', 'V'), ('9', 'M'), (']', 'CH'), ('hay', 'C'), ('Việt ngữ', 'V'), ('là', 'V'), ('ngôn ngữ', 'N'), ('của', 'E'), ('người', 'Nc'), ('Việt', 'Np'), ('và', 'C'), ('là', 'V'), ('ngôn ngữ', 'N'), ('chính thức', 'A'), ('tại', 'E'), ('Việt Nam', 'Np'), ('.', 'CH')] - assert tokens_tagged_universal == [('Tiếng', 'NOUN'), ('Việt', 'PROPN'), (',', 'PUNCT'), ('cũng', 'X'), ('gọi là', 'X'), ('tiếng', 'NOUN'), ('Việt Nam', 'PROPN'), ('[', 'VERB'), ('9', 'NUM'), (']', 'PUNCT'), ('hay', 'CCONJ'), ('Việt ngữ', 'VERB'), ('là', 'VERB'), ('ngôn ngữ', 'NOUN'), ('của', 'ADP'), ('người', 'NOUN'), ('Việt', 'PROPN'), ('và', 'CCONJ'), ('là', 'VERB'), ('ngôn ngữ', 'NOUN'), ('chính thức', 'ADJ'), ('tại', 'ADP'), ('Việt Nam', 'PROPN'), ('.', 'PUNCT')] - else: - raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + # Tagged + tags_orig = ['_TEST'] + tokens_tagged = wl_pos_tagging.wl_pos_tag( + main, + inputs = wl_texts.to_tokens(['test'], lang = lang, tags = tags_orig), + lang = lang, + pos_tagger = pos_tagger + ) + tags_tagged = [token.tag for token in tokens_tagged] - if tests_lang_util_skipped: - raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(pos_tagger) + assert tags_tagged == tags_orig if __name__ == '__main__': for lang, pos_tagger in test_pos_taggers_local: diff --git a/tests/tests_nlp/test_sentence_tokenization.py b/tests/tests_nlp/test_sentence_tokenization.py index 8450b99e8..88bfaa208 100644 --- a/tests/tests_nlp/test_sentence_tokenization.py +++ b/tests/tests_nlp/test_sentence_tokenization.py @@ -20,7 +20,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import wl_sentence_tokenization, wl_word_tokenization +from wordless.wl_nlp import wl_sentence_tokenization, wl_texts, wl_word_tokenization from wordless.wl_utils import wl_misc _, is_macos, _ = wl_misc.check_os() @@ -68,63 +68,65 @@ def test_sentence_tokenize(lang, sentence_tokenizer): tests_lang_util_skipped = False - if lang == 'ces': - assert sentences == ['Čeština neboli český jazyk je západoslovanský jazyk, nejbližší slovenštině, poté lužické srbštině a polštině.', 'Patří mezi slovanské jazyky, do rodiny jazyků indoevropských.', 'Čeština se vyvinula ze západních nářečí praslovanštiny na konci 10. století.', 'Je částečně ovlivněná latinou a němčinou.', 'Česky psaná literatura se objevuje od 14. století.', 'První písemné památky jsou však již z 12. století.'] - elif lang == 'dan': - assert sentences == ['Dansk er et østnordisk sprog indenfor den germanske gren af den indoeuropæiske sprogfamilie.', 'Det danske sprog tales af ca. 
seks millioner mennesker, hovedsageligt i Danmark, men også i Sydslesvig, på Færøerne og Grønland.', '[1] Dansk er tæt beslægtet med norsk, svensk og islandsk, og sproghistorisk har dansk været stærkt påvirket af plattysk.'] - elif lang == 'nld': - assert sentences == ['Het Nederlands is een West-Germaanse taal, de meest gebruikte taal in Nederland en België, de officiële taal van Suriname en een van de drie officiële talen van België.', 'Binnen het Koninkrijk der Nederlanden is het Nederlands ook een officiële taal van Aruba, Curaçao en Sint-Maarten.', 'Het Nederlands is na Engels en Duits de meest gesproken Germaanse taal.'] - elif lang.startswith('eng_') or lang == 'other': - assert sentences == ['English is a West Germanic language in the Indo-European language family.', 'Originating in early medieval England,[3][4][5] today English is both the most spoken language in the world[6] and the third most spoken native language, after Mandarin Chinese and Spanish.', '[7] English is the most widely learned second language and is either the official language or one of the official languages in 59 sovereign states.', 'There are more people who have learned English as a second language than there are native speakers.', 'As of 2005, it was estimated that there were over two billion speakers of English.', '[8]'] - elif lang == 'est': - assert sentences == ['Eesti keelel on kaks suuremat murderühma (põhjaeesti ja lõunaeesti), mõnes käsitluses eristatakse ka kirderanniku murdeid eraldi murderühmana.', 'Liikumisvõimaluste laienemine ning põhjaeesti keskmurde alusel loodud normitud eesti kirjakeele kasutus on põhjustanud murdeerinevuste taandumise.'] - elif lang == 'fin': - assert sentences == ['Suomen kieli eli suomi on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli, jota puhuvat pääosin suomalaiset.', 'Suomessa suomen kieltä puhuu äidinkielenään 4,8 miljoonaa ja toisena kielenään 0,5 miljoonaa ihmistä.', 'Suurimmat suomea puhuvat vähemmistöt ovat Ruotsissa, Norjassa ja Venäjällä.'] - elif lang == 'fra': - assert sentences == ['Le français est une langue indo-européenne de la famille des langues romanes dont les locuteurs sont appelés francophones.', 'Elle est parfois surnommée la langue de Molière.'] - elif lang.startswith('deu_'): - assert sentences == ['Das Deutsche ist eine plurizentrische Sprache, enthält also mehrere Standardvarietäten in verschiedenen Regionen.', 'Ihr Sprachgebiet umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig.', 'Außerdem ist Deutsch eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z.', 'B. in Rumänien und Südafrika sowie Nationalsprache im afrikanischen Namibia.', 'Deutsch ist die meistgesprochene Muttersprache in der Europäischen Union (EU).', '[26]'] - elif lang == 'ell': - assert sentences == ['Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και αποτελεί το μοναδικό μέλος του ελληνικού κλάδου, ενώ είναι η επίσημη γλώσσα της Ελλάδας και της Κύπρου.', 'Ανήκει επίσης στο βαλκανικό γλωσσικό δεσμό.', 'Στην ελληνική γλώσσα, έχουμε γραπτά κείμενα ήδη από τον 15ο αιώνα π.Χ.. 
Σαν Παγκόσμια Ημέρα Ελληνικής Γλώσσας, κάθε έτος, έχει καθιερωθεί η 9η Φεβρουαρίου.', 'Έχει την μακροβιότερη καταγεγραμμένη ιστορία από οποιαδήποτε άλλη ζωντανή ινδοευρωπαϊκή γλώσσα με τουλάχιστον 3.400 χρόνια γραπτής ιστορίας.', '[10] Γράφεται με το ελληνικό αλφάβητο, το οποίο χρησιμοποιείται αδιάκοπα (αρχικά με τοπικές παραλλαγές, μετέπειτα υπό μια, ενιαία μορφή) εδώ και περίπου 2.600 χρόνια.', '[11][12] Προηγουμένως η ελληνική γλώσσα γραφόταν με τη Γραμμική Β και το κυπριακό συλλαβάριο.', '[13] Το ελληνικό αλφάβητο προέρχεται από το φοινικικό αλφάβητο, με κάποιες προσαρμογές.', 'Στο ελληνικό αλφάβητο βασίζεται το λατινικό, το κυριλλικό, το αρμενικό, το κοπτικό, το γοτθικό και πολλά άλλα αλφάβητα.'] - elif lang == 'ita': - assert sentences == ["L'italiano ([itaˈljaːno][Nota 1] ascoltaⓘ) è una lingua romanza parlata principalmente in Italia.", "Per ragioni storiche e geografiche, l'italiano è la lingua romanza meno divergente dal latino.", '[2][3][4][Nota 2]'] - elif lang == 'khm': - assert sentences == ['ភាសាខ្មែរ គឺជាភាសាកំណើតរបស់ជនជាតិខ្មែរនិងជាភាសាផ្លូវការរបស់ប្រទេសកម្ពុជា។', 'ភាសាសំស្ក្រឹតនិងភាសាបាលីបាន\u200bជួយបង្កើតខេមរភាសា ព្រោះភាសាខ្មែរបានខ្ចីពាក្យច្រើនពីភាសាទាំងពីរនេះ។', '\u200bមានអក្សរក្រមវែងជាងគេនៅលើពិភពលោក ។', '\u200b វាជាភាសាមួយដ៏ចំណាស់\u200b ដែលប្រហែលជាមានដើមកំណើតតាំងតែពី\u200b២០០០ឆ្នាំមុនមកម៉្លេះ។'] - elif lang == 'lao': - assert sentences == ['ພາສາລາວ (Lao: ລາວ, [láːw] ຫຼື ພາສາລາວ, [pʰáːsǎːláːw]) ເປັນພາສາຕະກູນໄທ-ກະໄດຂອງຄົນລາວ ໂດຍມີຄົນເວົ້າໃນປະເທດລາວ ເຊິ່ງເປັນພາສາລັດຖະການຂອງສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ ຂອງປະຊາກອນປະມານ 7 ລ້ານຄົນ ແລະໃນພື້ນທີ່ພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດໄທທີ່ມີຄົນເວົ້າປະມານ 23 ລ້ານຄົນ ທາງລັດຖະບານປະເທດໄທມີການສະໜັບສະໜຸນໃຫ້ເອີ້ນພາສາລາວຖິ່ນໄທວ່າ ພາສາລາວຖິ່ນອີສານ ນອກຈາກນີ້, ຢູ່ທາງພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດກຳປູເຈຍກໍມີຄົນເວົ້າພາສາລາວຄືກັນ.', 'ພາສາລາວເປັນແມ່ຂອງຄົນເຊື້ອຊາດລາວທັງຢູ່ພາຍໃນແລະຕ່າງປະເທດ ທັງເປັນພາສາກາງຂອງພົນລະເມືອງໃນປະເທດລາວທີ່ມີພາສາອື່ນອີກຫຼາຍພາສາ ເຊິ່ງບາງພາສາບໍ່ມີຄວາມກ່ຽວຂ້ອງກັບພາສານີ້[3] .'] - elif lang == 'mal': - assert sentences == ['ഇന്ത്യയിൽ കേരള സംസ്ഥാനത്തിലും കേന്ദ്രഭരണപ്രദേശങ്ങളായ ലക്ഷദ്വീപിലും പോണ്ടിച്ചേരിയുടെ ഭാഗമായ മാഹിയിലും തമിഴ്നാട്ടിലെ കന്യാകുമാരി ജില്ലയിലും നീലഗിരി ജില്ലയിലെ ഗൂഡല്ലൂർ താലൂക്കിലും സംസാരിക്കപ്പെടുന്ന ഭാഷയാണ് മലയാളം.', 'ഇതു ദ്രാവിഡ ഭാഷാ കുടുംബത്തിൽപ്പെടുന്നു.', 'ഇന്ത്യയിൽ ശ്രേഷ്ഠഭാഷാ പദവി ലഭിക്കുന്ന അഞ്ചാമത്തെ ഭാഷയാണ് മലയാളം[5].', '2013 മെയ് 23-നു ചേർന്ന കേന്ദ്രമന്ത്രിസഭായോഗമാണ് മലയാളത്തെ ശ്രേഷ്ഠഭാഷയായി അംഗീകരിച്ചത്.', 'ക്ലാസിക്കൽ ലാംഗ്വേജ് എന്ന പദവിയാണ് ലൽകിയത്.', 'അതിനു മലയാളത്തിൽ നൽകിയ വിവർത്തനം ആണ് ശ്രേഷ്ഠഭാഷ എന്നത്.', 'വാസ്തവത്തിൽ ഇത് അത്രശരിയായ വിവർത്തനമോ ശരിയായ പ്രയോഗമോ അല്ല.', 'ശ്രേഷ്ഠം മോശം എന്ന നിലയിൽ ഭാഷകളെ വിലയിരുത്തുന്നത് ശാസ്ത്രീയമായ കാര്യമല്ല.', 'ഭാഷകളിൽ ശ്രേഷ്ഠമെന്നും അല്ലാത്തത് എന്നുമുള്ള വിഭജനം ഇല്ല.', 'ഇന്ത്യൻ ഭരണഘടനയിലെ എട്ടാം ഷെഡ്യൂളിൽ ഉൾപ്പെടുത്തിയിരിക്കുന്ന ഇന്ത്യയിലെ ഇരുപത്തിരണ്ട് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നാണ് മലയാളം[6].', 'മലയാള ഭാഷ കൈരളി,മലനാട് ഭാഷ എന്നും അറിയപ്പെടുന്നു.കേരള സംസ്ഥാനത്തിലെ ഭരണഭാഷയും കൂടിയാണ്\u200c മലയാളം.', 'കേരളത്തിനും ലക്ഷദ്വീപിനും പുറമേ തമിഴ്നാട്ടിലെ ചില ഭാഗങ്ങളിലും കന്യാകുമാരി ജില്ല, നീലഗിരി ജില്ല കർണാടകയുടെ ദക്ഷിണ കന്നഡ ജില്ല, കൊടഗ് ഭാഗങ്ങളിലും ഗൾഫ് രാജ്യങ്ങൾ, സിംഗപ്പൂർ, മലേഷ്യ എന്നിവിടങ്ങളിലെ കേരളീയ പൈതൃകമുള്ള അനേകം ജനങ്ങളും മലയാളം ഉപയോഗിച്ചുപോരുന്നു.ദേശീയ ഭാഷയായി ഉൾപ്പെടുത്തിയത് മറ്റ് 21 ഭാഷകളുടേതുപോലെ തനതായ വ്യക്തിത്വം ഉള്ളതിനാലാണ്.', 'മലയാള ഭാഷയുടെ ഉല്പത്തിയും പ്രാചീനതയും സംബന്ധിച്ച കാര്യങ്ങൾ ഇന്നും അവ്യക്തമാണ്.', 'പഴയ തമിഴിനും മുൻപത്തെ മൂലദ്രാവിഡമാണ് മലയാളത്തിന്റെ ആദ്യ രൂപം എന്നു കരുതുന്നു.', 'യു.എ.ഇ-യിലെ നാല് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നു മലയാളമാണ്.', '[അവലംബം ആവശ്യമാണ്]'] - elif 
lang == 'nob': - assert sentences == ['Bokmål er en av to offisielle målformer av norsk skriftspråk, hvorav den andre er nynorsk.', 'I skrift har 87,3% bokmål som hovedmål i skolen.', '[1] Etter skriftreformene av riksmål i 1987 og bokmål i 1981 og 2005 er det lite som skiller bokmål og riksmål i alminnelig bruk.'] - elif lang == 'nno': - assert sentences == ['Nynorsk, før 1929 offisielt kalla landsmål, er sidan jamstillingsvedtaket av 12. mai 1885 ei av dei to offisielle målformene av norsk; den andre forma er bokmål.', 'Nynorsk vert i dag nytta av om lag 10–15% av innbyggjarane i Noreg.', '[1][2] Skriftspråket er basert på nynorsk talemål, det vil seie dei moderne norske dialektane til skilnad frå gamalnorsk og mellomnorsk.', 'Når ein seier at nokon snakkar nynorsk, meiner ein helst at dei snakkar nynorsk normaltalemål.', 'Dei færraste dialekttalande nordmenn seier at dei snakkar nynorsk, men det er ikkje uvanleg i kjerneområda til nynorsken.', 'Dette tilhøvet mellom tale og skrift ligg bak målrørsla sitt slagord sidan 1970-talet: «Snakk dialekt – skriv nynorsk!» Nynorske dialektar vart snakka over heile landet, men det er berre på Vestlandet utanom dei største byene og i dei austlandske fjellbygdene at skriftspråket står sterkt.', 'Det vil seie at dei fleste dialekttalarane har bokmål som det primære skriftspråket sitt.'] - elif lang == 'pol': - assert sentences == ['Język polski, polszczyzna – język z grupy zachodniosłowiańskiej (do której należą również czeski, kaszubski, słowacki i języki łużyckie), stanowiącej część rodziny indoeuropejskiej.', 'Jest językiem urzędowym w Polsce oraz należy do oficjalnych języków Unii Europejskiej.'] - elif lang.startswith('por_'): - assert sentences == ['A língua portuguesa, também designada português, é uma língua indo-europeia românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.', 'Com a criação do Reino de Portugal em 1139 e a expansão para o sul na sequência da Reconquista, deu-se a difusão da língua pelas terras conquistadas e mais tarde, com as descobertas portuguesas, para o Brasil, África e outras partes do mundo.', '[8] O português foi usado, naquela época, não somente nas cidades conquistadas pelos portugueses, mas também por muitos governantes locais nos seus contatos com outros estrangeiros poderosos.', 'Especialmente nessa altura a língua portuguesa também influenciou várias línguas.', '[9]'] - elif lang == 'rus': - assert sentences == ['Ру́сский язы́к (МФА: [ˈruskʲɪi̯ jɪˈzɨk]ⓘ)[~ 3][⇨] — язык восточнославянской группы славянской ветви индоевропейской языковой семьи, национальный язык русского народа.', 'Является одним из наиболее распространённых языков мира — восьмым среди всех языков мира по общей численности говорящих[5] и седьмым по численности владеющих им как родным (2022)[2].', 'Русский является также самым распространённым славянским языком[8] и самым распространённым языком в Европе — географически и по числу носителей языка как родного[6].'] - elif lang == 'slv': - assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja 
gajica, pisava imenovana po hrvaškem jezikoslovcu Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848.', 'Do takrat smo uporabljali bohoričico.'] - elif lang == 'spa': - assert sentences == ['El español o castellano es una lengua romance procedente del latín hablado, perteneciente a la familia de lenguas indoeuropeas.', 'Forma parte del grupo ibérico y es originaria de Castilla, reino medieval de la península ibérica.', 'Se conoce también informalmente como castillan.', '1\u200b33\u200b34\u200b en algunas áreas rurales e indígenas de América,35\u200b pues el español se empezó a enseñar poco después de la incorporación de los nuevos territorios a la Corona de Castilla.36\u200b37\u200b38\u200b39\u200b40\u200b41\u200b'] - elif lang == 'swe': - assert sentences == ['Svenska (svenska\u2009(info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.', 'I övriga Finland talas det som modersmål framförallt i de finlandssvenska kustområdena i Österbotten, Åboland och Nyland.', 'En liten minoritet svenskspråkiga finns även i Estland.', 'Svenska är nära besläktat och i hög grad ömsesidigt begripligt med danska och norska.', 'De andra nordiska språken, isländska och färöiska, är mindre ömsesidigt begripliga med svenska.', 'Liksom de övriga nordiska språken härstammar svenskan från en gren av fornnordiska, vilket var det språk som talades av de germanska folken i Skandinavien.'] - elif lang == 'tha': - if sentence_tokenizer == 'pythainlp_crfcut': - assert sentences == ['ภาษาไทย หรือ ภาษาไทยกลาง เป็นภาษาในกลุ่มภาษาไท ซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า-ไท และเป็นภาษาราชการ และภาษาประจำชาติของประเทศไทย[3][4] มีการสันนิษฐานว่าภาษาในตระกูลนี้มีถิ่นกำเนิดจากทางตอนใต้ของประเทศจีน และนักภาษาศาสตร์บางส่วนเสนอว่า ภาษาไทยน่าจะมีความเชื่อมโยงกับตระกูลภาษาออสโตร-เอเชียติก', 'ตระกูลภาษาออสโตรนีเซียน และตระกูลภาษาจีน-ทิเบต'] - elif sentence_tokenizer == 'pythainlp_thaisumcut': - assert sentences == ['ภาษาไทย', 'หรือ ภาษาไทยกลาง เป็นภาษาในกลุ่มภาษาไท', 'ซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า-ไท และเป็นภาษาราชการ', 'และภาษาประจำชาติของประเทศไทย[3][4] มีการสันนิษฐานว่าภาษาในตระกูลนี้มีถิ่นกำเนิดจากทางตอนใต้ของประเทศจีน', 'และนักภาษาศาสตร์บางส่วนเสนอว่า ภาษาไทยน่าจะมีความเชื่อมโยงกับตระกูลภาษาออสโตร-เอเชียติก ตระกูลภาษาออสโตรนีเซียน และตระกูลภาษาจีน-ทิเบต'] - else: - tests_lang_util_skipped = True - elif lang == 'bod': - assert sentences == ['བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་ཉེ་འཁོར་གྱི་ས་ཁུལ་བལ་ཡུལ། འབྲུག་དང་འབྲས་ལྗོངས། ལ་དྭགས་ནས་ལྷོ་མོན་རོང་སོགས་སུ་བེད་སྤྱོད་བྱེད་པའི་སྐད་ཡིག་དེ།', 'ད་ཆར་ཡོངས་གྲགས་སུ་བོད་ཀྱི་ཡུལ་གྲུ་སྟོད་སྨད་བར་གསུམ་ལ་ལྟོས་ཏེ་ནང་གསེས་རིགས་གསུམ་དུ་ཕྱེ་བ་སྟེ།', 'སྟོད་དབུས་གཙང་གི་སྐད་དང་། བར་ཁམས་པའི་སྐད་དང་། སྨད་ཨ་མདོའི་སྐད་རྣམས་སོ།', 'བོད་སྐད་ནི་ཧོར་སོག་ལ་སོགས་པ་གྲངས་ཉུང་མི་རིགས་གཞན་པ་ཁག་ཅིག་གིས་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད་པར་མ་ཟད། བལ་ཡུལ་དང་། འབྲས་ལྗོངས། འབྲུག་ཡུལ་། རྒྱ་གར་ཤར་དང་བྱང་རྒྱུད་མངའ་སྡེ་ཁག་གཅིག་བཅས་ཀྱི་རྒྱལ་ཁབ་རྣམས་སུའང་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད།'] - elif lang == 'tur': - assert sentences == ["Türkçe ya da Türk dili, Güneydoğu Avrupa ve Batı Asya'da konuşulan, Türk dilleri dil ailesine ait sondan eklemeli bir dil.", '[12] Türk dilleri ailesinin Oğuz dilleri grubundan bir Batı Oğuz dili olan Osmanlı Türkçesinin devamını oluşturur.', "Dil, başta Türkiye olmak üzere Balkanlar, Ege Adaları, Kıbrıs ve Orta Doğu'yu kapsayan eski Osmanlı İmparatorluğu 
coğrafyasında konuşulur.", "[12] Ethnologue'a göre Türkçe, yaklaşık 83 milyon konuşuru ile dünyada en çok konuşulan 16.", 'dildir.', "[13] Türkçe Türkiye, Kıbrıs Cumhuriyeti ve Kuzey Kıbrıs'ta ulusal resmî dil statüsüne sahiptir.", '[12]'] - elif lang == 'vie': - assert sentences == ['Tiếng Việt, cũng gọi là tiếng Việt Nam[9] hay Việt ngữ là ngôn ngữ của người Việt và là ngôn ngữ chính thức tại Việt Nam.', 'Đây là tiếng mẹ đẻ của khoảng 85% dân cư Việt Nam cùng với hơn 4 triệu người Việt kiều.', 'Tiếng Việt còn là ngôn ngữ thứ hai của các dân tộc thiểu số tại Việt Nam và là ngôn ngữ dân tộc thiểu số được công nhận tại Cộng hòa Séc.'] - else: - raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + match lang: + case 'ces': + assert sentences == ['Čeština neboli český jazyk je západoslovanský jazyk, nejbližší slovenštině, poté lužické srbštině a polštině.', 'Patří mezi slovanské jazyky, do rodiny jazyků indoevropských.', 'Čeština se vyvinula ze západních nářečí praslovanštiny na konci 10. století.', 'Je částečně ovlivněná latinou a němčinou.', 'Česky psaná literatura se objevuje od 14. století.', 'První písemné památky jsou však již z 12. století.'] + case 'dan': + assert sentences == ['Dansk er et østnordisk sprog indenfor den germanske gren af den indoeuropæiske sprogfamilie.', 'Det danske sprog tales af ca. seks millioner mennesker, hovedsageligt i Danmark, men også i Sydslesvig, på Færøerne og Grønland.', '[1] Dansk er tæt beslægtet med norsk, svensk og islandsk, og sproghistorisk har dansk været stærkt påvirket af plattysk.'] + case 'nld': + assert sentences == ['Het Nederlands is een West-Germaanse taal, de meest gebruikte taal in Nederland en België, de officiële taal van Suriname en een van de drie officiële talen van België.', 'Binnen het Koninkrijk der Nederlanden is het Nederlands ook een officiële taal van Aruba, Curaçao en Sint-Maarten.', 'Het Nederlands is na Engels en Duits de meest gesproken Germaanse taal.'] + case 'eng_gb' | 'eng_us' | 'other': + assert sentences == ['English is a West Germanic language in the Indo-European language family.', 'Originating in early medieval England,[3][4][5] today English is both the most spoken language in the world[6] and the third most spoken native language, after Mandarin Chinese and Spanish.', '[7] English is the most widely learned second language and is either the official language or one of the official languages in 59 sovereign states.', 'There are more people who have learned English as a second language than there are native speakers.', 'As of 2005, it was estimated that there were over two billion speakers of English.', '[8]'] + case 'est': + assert sentences == ['Eesti keelel on kaks suuremat murderühma (põhjaeesti ja lõunaeesti), mõnes käsitluses eristatakse ka kirderanniku murdeid eraldi murderühmana.', 'Liikumisvõimaluste laienemine ning põhjaeesti keskmurde alusel loodud normitud eesti kirjakeele kasutus on põhjustanud murdeerinevuste taandumise.'] + case 'fin': + assert sentences == ['Suomen kieli eli suomi on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli, jota puhuvat pääosin suomalaiset.', 'Suomessa suomen kieltä puhuu äidinkielenään 4,8 miljoonaa ja toisena kielenään 0,5 miljoonaa ihmistä.', 'Suurimmat suomea puhuvat vähemmistöt ovat Ruotsissa, Norjassa ja Venäjällä.'] + case 'fra': + assert sentences == ['Le français est une langue indo-européenne de la famille des langues romanes dont les locuteurs sont appelés francophones.', 'Elle est parfois surnommée la langue de Molière.'] + case 'deu_at' | 
'deu_de' | 'deu_ch': + assert sentences == ['Das Deutsche ist eine plurizentrische Sprache, enthält also mehrere Standardvarietäten in verschiedenen Regionen.', 'Ihr Sprachgebiet umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig.', 'Außerdem ist Deutsch eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z.', 'B. in Rumänien und Südafrika sowie Nationalsprache im afrikanischen Namibia.', 'Deutsch ist die meistgesprochene Muttersprache in der Europäischen Union (EU).', '[26]'] + case 'ell': + assert sentences == ['Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και αποτελεί το μοναδικό μέλος του ελληνικού κλάδου, ενώ είναι η επίσημη γλώσσα της Ελλάδας και της Κύπρου.', 'Ανήκει επίσης στο βαλκανικό γλωσσικό δεσμό.', 'Στην ελληνική γλώσσα, έχουμε γραπτά κείμενα ήδη από τον 15ο αιώνα π.Χ.. Σαν Παγκόσμια Ημέρα Ελληνικής Γλώσσας, κάθε έτος, έχει καθιερωθεί η 9η Φεβρουαρίου.', 'Έχει την μακροβιότερη καταγεγραμμένη ιστορία από οποιαδήποτε άλλη ζωντανή ινδοευρωπαϊκή γλώσσα με τουλάχιστον 3.400 χρόνια γραπτής ιστορίας.', '[10] Γράφεται με το ελληνικό αλφάβητο, το οποίο χρησιμοποιείται αδιάκοπα (αρχικά με τοπικές παραλλαγές, μετέπειτα υπό μια, ενιαία μορφή) εδώ και περίπου 2.600 χρόνια.', '[11][12] Προηγουμένως η ελληνική γλώσσα γραφόταν με τη Γραμμική Β και το κυπριακό συλλαβάριο.', '[13] Το ελληνικό αλφάβητο προέρχεται από το φοινικικό αλφάβητο, με κάποιες προσαρμογές.', 'Στο ελληνικό αλφάβητο βασίζεται το λατινικό, το κυριλλικό, το αρμενικό, το κοπτικό, το γοτθικό και πολλά άλλα αλφάβητα.'] + case 'ita': + assert sentences == ["L'italiano ([itaˈljaːno][Nota 1] ascoltaⓘ) è una lingua romanza parlata principalmente in Italia.", "Per ragioni storiche e geografiche, l'italiano è la lingua romanza meno divergente dal latino.", '[2][3][4][Nota 2]'] + case 'khm': + assert sentences == ['ភាសាខ្មែរ គឺជាភាសាកំណើតរបស់ជនជាតិខ្មែរនិងជាភាសាផ្លូវការរបស់ប្រទេសកម្ពុជា។', 'ភាសាសំស្ក្រឹតនិងភាសាបាលីបាន\u200bជួយបង្កើតខេមរភាសា ព្រោះភាសាខ្មែរបានខ្ចីពាក្យច្រើនពីភាសាទាំងពីរនេះ។', '\u200bមានអក្សរក្រមវែងជាងគេនៅលើពិភពលោក ។', '\u200b វាជាភាសាមួយដ៏ចំណាស់\u200b ដែលប្រហែលជាមានដើមកំណើតតាំងតែពី\u200b២០០០ឆ្នាំមុនមកម៉្លេះ។'] + case 'lao': + assert sentences == ['ພາສາລາວ (Lao: ລາວ, [láːw] ຫຼື ພາສາລາວ, [pʰáːsǎːláːw]) ເປັນພາສາຕະກູນໄທ-ກະໄດຂອງຄົນລາວ ໂດຍມີຄົນເວົ້າໃນປະເທດລາວ ເຊິ່ງເປັນພາສາລັດຖະການຂອງສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ ຂອງປະຊາກອນປະມານ 7 ລ້ານຄົນ ແລະໃນພື້ນທີ່ພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດໄທທີ່ມີຄົນເວົ້າປະມານ 23 ລ້ານຄົນ ທາງລັດຖະບານປະເທດໄທມີການສະໜັບສະໜຸນໃຫ້ເອີ້ນພາສາລາວຖິ່ນໄທວ່າ ພາສາລາວຖິ່ນອີສານ ນອກຈາກນີ້, ຢູ່ທາງພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດກຳປູເຈຍກໍມີຄົນເວົ້າພາສາລາວຄືກັນ.', 'ພາສາລາວເປັນແມ່ຂອງຄົນເຊື້ອຊາດລາວທັງຢູ່ພາຍໃນແລະຕ່າງປະເທດ ທັງເປັນພາສາກາງຂອງພົນລະເມືອງໃນປະເທດລາວທີ່ມີພາສາອື່ນອີກຫຼາຍພາສາ ເຊິ່ງບາງພາສາບໍ່ມີຄວາມກ່ຽວຂ້ອງກັບພາສານີ້[3] .'] + case 'mal': + assert sentences == ['ഇന്ത്യയിൽ കേരള സംസ്ഥാനത്തിലും കേന്ദ്രഭരണപ്രദേശങ്ങളായ ലക്ഷദ്വീപിലും പോണ്ടിച്ചേരിയുടെ ഭാഗമായ മാഹിയിലും തമിഴ്നാട്ടിലെ കന്യാകുമാരി ജില്ലയിലും നീലഗിരി ജില്ലയിലെ ഗൂഡല്ലൂർ താലൂക്കിലും സംസാരിക്കപ്പെടുന്ന ഭാഷയാണ് മലയാളം.', 'ഇതു ദ്രാവിഡ ഭാഷാ കുടുംബത്തിൽപ്പെടുന്നു.', 'ഇന്ത്യയിൽ ശ്രേഷ്ഠഭാഷാ പദവി ലഭിക്കുന്ന അഞ്ചാമത്തെ ഭാഷയാണ് മലയാളം[5].', '2013 മെയ് 23-നു ചേർന്ന കേന്ദ്രമന്ത്രിസഭായോഗമാണ് മലയാളത്തെ ശ്രേഷ്ഠഭാഷയായി അംഗീകരിച്ചത്.', 'ക്ലാസിക്കൽ ലാംഗ്വേജ് എന്ന പദവിയാണ് ലൽകിയത്.', 'അതിനു മലയാളത്തിൽ നൽകിയ വിവർത്തനം ആണ് ശ്രേഷ്ഠഭാഷ എന്നത്.', 'വാസ്തവത്തിൽ ഇത് അത്രശരിയായ വിവർത്തനമോ ശരിയായ പ്രയോഗമോ അല്ല.', 'ശ്രേഷ്ഠം മോശം എന്ന നിലയിൽ ഭാഷകളെ വിലയിരുത്തുന്നത് 
ശാസ്ത്രീയമായ കാര്യമല്ല.', 'ഭാഷകളിൽ ശ്രേഷ്ഠമെന്നും അല്ലാത്തത് എന്നുമുള്ള വിഭജനം ഇല്ല.', 'ഇന്ത്യൻ ഭരണഘടനയിലെ എട്ടാം ഷെഡ്യൂളിൽ ഉൾപ്പെടുത്തിയിരിക്കുന്ന ഇന്ത്യയിലെ ഇരുപത്തിരണ്ട് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നാണ് മലയാളം[6].', 'മലയാള ഭാഷ കൈരളി,മലനാട് ഭാഷ എന്നും അറിയപ്പെടുന്നു.കേരള സംസ്ഥാനത്തിലെ ഭരണഭാഷയും കൂടിയാണ്\u200c മലയാളം.', 'കേരളത്തിനും ലക്ഷദ്വീപിനും പുറമേ തമിഴ്നാട്ടിലെ ചില ഭാഗങ്ങളിലും കന്യാകുമാരി ജില്ല, നീലഗിരി ജില്ല കർണാടകയുടെ ദക്ഷിണ കന്നഡ ജില്ല, കൊടഗ് ഭാഗങ്ങളിലും ഗൾഫ് രാജ്യങ്ങൾ, സിംഗപ്പൂർ, മലേഷ്യ എന്നിവിടങ്ങളിലെ കേരളീയ പൈതൃകമുള്ള അനേകം ജനങ്ങളും മലയാളം ഉപയോഗിച്ചുപോരുന്നു.ദേശീയ ഭാഷയായി ഉൾപ്പെടുത്തിയത് മറ്റ് 21 ഭാഷകളുടേതുപോലെ തനതായ വ്യക്തിത്വം ഉള്ളതിനാലാണ്.', 'മലയാള ഭാഷയുടെ ഉല്പത്തിയും പ്രാചീനതയും സംബന്ധിച്ച കാര്യങ്ങൾ ഇന്നും അവ്യക്തമാണ്.', 'പഴയ തമിഴിനും മുൻപത്തെ മൂലദ്രാവിഡമാണ് മലയാളത്തിന്റെ ആദ്യ രൂപം എന്നു കരുതുന്നു.', 'യു.എ.ഇ-യിലെ നാല് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നു മലയാളമാണ്.', '[അവലംബം ആവശ്യമാണ്]'] + case 'nob': + assert sentences == ['Bokmål er en av to offisielle målformer av norsk skriftspråk, hvorav den andre er nynorsk.', 'I skrift har 87,3% bokmål som hovedmål i skolen.', '[1] Etter skriftreformene av riksmål i 1987 og bokmål i 1981 og 2005 er det lite som skiller bokmål og riksmål i alminnelig bruk.'] + case 'nno': + assert sentences == ['Nynorsk, før 1929 offisielt kalla landsmål, er sidan jamstillingsvedtaket av 12. mai 1885 ei av dei to offisielle målformene av norsk; den andre forma er bokmål.', 'Nynorsk vert i dag nytta av om lag 10–15% av innbyggjarane i Noreg.', '[1][2] Skriftspråket er basert på nynorsk talemål, det vil seie dei moderne norske dialektane til skilnad frå gamalnorsk og mellomnorsk.', 'Når ein seier at nokon snakkar nynorsk, meiner ein helst at dei snakkar nynorsk normaltalemål.', 'Dei færraste dialekttalande nordmenn seier at dei snakkar nynorsk, men det er ikkje uvanleg i kjerneområda til nynorsken.', 'Dette tilhøvet mellom tale og skrift ligg bak målrørsla sitt slagord sidan 1970-talet: «Snakk dialekt – skriv nynorsk!» Nynorske dialektar vart snakka over heile landet, men det er berre på Vestlandet utanom dei største byene og i dei austlandske fjellbygdene at skriftspråket står sterkt.', 'Det vil seie at dei fleste dialekttalarane har bokmål som det primære skriftspråket sitt.'] + case 'pol': + assert sentences == ['Język polski, polszczyzna – język z grupy zachodniosłowiańskiej (do której należą również czeski, kaszubski, słowacki i języki łużyckie), stanowiącej część rodziny indoeuropejskiej.', 'Jest językiem urzędowym w Polsce oraz należy do oficjalnych języków Unii Europejskiej.'] + case 'por_br' | 'por_pt': + assert sentences == ['A língua portuguesa, também designada português, é uma língua indo-europeia românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.', 'Com a criação do Reino de Portugal em 1139 e a expansão para o sul na sequência da Reconquista, deu-se a difusão da língua pelas terras conquistadas e mais tarde, com as descobertas portuguesas, para o Brasil, África e outras partes do mundo.', '[8] O português foi usado, naquela época, não somente nas cidades conquistadas pelos portugueses, mas também por muitos governantes locais nos seus contatos com outros estrangeiros poderosos.', 'Especialmente nessa altura a língua portuguesa também influenciou várias línguas.', '[9]'] + case 'rus': + assert sentences == ['Ру́сский язы́к (МФА: [ˈruskʲɪi̯ jɪˈzɨk]ⓘ)[~ 3][⇨] — язык восточнославянской группы славянской ветви индоевропейской языковой семьи, национальный язык русского народа.', 'Является одним из наиболее 
распространённых языков мира — восьмым среди всех языков мира по общей численности говорящих[5] и седьмым по численности владеющих им как родным (2022)[2].', 'Русский является также самым распространённым славянским языком[8] и самым распространённым языком в Европе — географически и по числу носителей языка как родного[6].'] + case 'slv': + assert sentences == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,5 (dva in pol) milijona govorcev po svetu, od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov, ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica, pisava imenovana po hrvaškem jezikoslovcu Ljudevitu Gaju, ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848.', 'Do takrat smo uporabljali bohoričico.'] + case 'spa': + assert sentences == ['El español o castellano es una lengua romance procedente del latín hablado, perteneciente a la familia de lenguas indoeuropeas.', 'Forma parte del grupo ibérico y es originaria de Castilla, reino medieval de la península ibérica.', 'Se conoce también informalmente como castillan.', '1\u200b33\u200b34\u200b en algunas áreas rurales e indígenas de América,35\u200b pues el español se empezó a enseñar poco después de la incorporación de los nuevos territorios a la Corona de Castilla.36\u200b37\u200b38\u200b39\u200b40\u200b41\u200b'] + case 'swe': + assert sentences == ['Svenska (svenska\u2009(info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.', 'I övriga Finland talas det som modersmål framförallt i de finlandssvenska kustområdena i Österbotten, Åboland och Nyland.', 'En liten minoritet svenskspråkiga finns även i Estland.', 'Svenska är nära besläktat och i hög grad ömsesidigt begripligt med danska och norska.', 'De andra nordiska språken, isländska och färöiska, är mindre ömsesidigt begripliga med svenska.', 'Liksom de övriga nordiska språken härstammar svenskan från en gren av fornnordiska, vilket var det språk som talades av de germanska folken i Skandinavien.'] + case 'tha': + match sentence_tokenizer: + case 'pythainlp_crfcut': + assert sentences == ['ภาษาไทย หรือ ภาษาไทยกลาง เป็นภาษาในกลุ่มภาษาไท ซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า-ไท และเป็นภาษาราชการ และภาษาประจำชาติของประเทศไทย[3][4] มีการสันนิษฐานว่าภาษาในตระกูลนี้มีถิ่นกำเนิดจากทางตอนใต้ของประเทศจีน และนักภาษาศาสตร์บางส่วนเสนอว่า ภาษาไทยน่าจะมีความเชื่อมโยงกับตระกูลภาษาออสโตร-เอเชียติก', 'ตระกูลภาษาออสโตรนีเซียน และตระกูลภาษาจีน-ทิเบต'] + case 'pythainlp_thaisumcut': + assert sentences == ['ภาษาไทย', 'หรือ ภาษาไทยกลาง เป็นภาษาในกลุ่มภาษาไท', 'ซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า-ไท และเป็นภาษาราชการ', 'และภาษาประจำชาติของประเทศไทย[3][4] มีการสันนิษฐานว่าภาษาในตระกูลนี้มีถิ่นกำเนิดจากทางตอนใต้ของประเทศจีน', 'และนักภาษาศาสตร์บางส่วนเสนอว่า ภาษาไทยน่าจะมีความเชื่อมโยงกับตระกูลภาษาออสโตร-เอเชียติก ตระกูลภาษาออสโตรนีเซียน และตระกูลภาษาจีน-ทิเบต'] + case _: + tests_lang_util_skipped = True + case 'bod': + assert sentences == ['བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་ཉེ་འཁོར་གྱི་ས་ཁུལ་བལ་ཡུལ། འབྲུག་དང་འབྲས་ལྗོངས། 
ལ་དྭགས་ནས་ལྷོ་མོན་རོང་སོགས་སུ་བེད་སྤྱོད་བྱེད་པའི་སྐད་ཡིག་དེ།', 'ད་ཆར་ཡོངས་གྲགས་སུ་བོད་ཀྱི་ཡུལ་གྲུ་སྟོད་སྨད་བར་གསུམ་ལ་ལྟོས་ཏེ་ནང་གསེས་རིགས་གསུམ་དུ་ཕྱེ་བ་སྟེ།', 'སྟོད་དབུས་གཙང་གི་སྐད་དང་། བར་ཁམས་པའི་སྐད་དང་། སྨད་ཨ་མདོའི་སྐད་རྣམས་སོ།', 'བོད་སྐད་ནི་ཧོར་སོག་ལ་སོགས་པ་གྲངས་ཉུང་མི་རིགས་གཞན་པ་ཁག་ཅིག་གིས་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད་པར་མ་ཟད། བལ་ཡུལ་དང་། འབྲས་ལྗོངས། འབྲུག་ཡུལ་། རྒྱ་གར་ཤར་དང་བྱང་རྒྱུད་མངའ་སྡེ་ཁག་གཅིག་བཅས་ཀྱི་རྒྱལ་ཁབ་རྣམས་སུའང་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད།'] + case 'tur': + assert sentences == ["Türkçe ya da Türk dili, Güneydoğu Avrupa ve Batı Asya'da konuşulan, Türk dilleri dil ailesine ait sondan eklemeli bir dil.", '[12] Türk dilleri ailesinin Oğuz dilleri grubundan bir Batı Oğuz dili olan Osmanlı Türkçesinin devamını oluşturur.', "Dil, başta Türkiye olmak üzere Balkanlar, Ege Adaları, Kıbrıs ve Orta Doğu'yu kapsayan eski Osmanlı İmparatorluğu coğrafyasında konuşulur.", "[12] Ethnologue'a göre Türkçe, yaklaşık 83 milyon konuşuru ile dünyada en çok konuşulan 16.", 'dildir.', "[13] Türkçe Türkiye, Kıbrıs Cumhuriyeti ve Kuzey Kıbrıs'ta ulusal resmî dil statüsüne sahiptir.", '[12]'] + case 'vie': + assert sentences == ['Tiếng Việt, cũng gọi là tiếng Việt Nam[9] hay Việt ngữ là ngôn ngữ của người Việt và là ngôn ngữ chính thức tại Việt Nam.', 'Đây là tiếng mẹ đẻ của khoảng 85% dân cư Việt Nam cùng với hơn 4 triệu người Việt kiều.', 'Tiếng Việt còn là ngôn ngữ thứ hai của các dân tộc thiểu số tại Việt Nam và là ngôn ngữ dân tộc thiểu số được công nhận tại Cộng hòa Séc.'] + case _: + raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) if tests_lang_util_skipped: raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(sentence_tokenizer) @@ -154,58 +156,59 @@ def test_sentence_seg_tokenize(lang): if lang not in ['tha']: assert len(sentence_segs) > 1 - if lang == 'ces': - assert sentence_segs == ['Čeština neboli český jazyk je západoslovanský jazyk,', 'nejbližší slovenštině,', 'poté lužické srbštině a polštině.', 'Patří mezi slovanské jazyky,', 'do rodiny jazyků indoevropských.', 'Čeština se vyvinula ze západních nářečí praslovanštiny na konci 10.', 'století.', 'Je částečně ovlivněná latinou a němčinou.', 'Česky psaná literatura se objevuje od 14.', 'století.', 'První písemné památky jsou však již z 12.', 'století.'] - elif lang == 'dan': - assert sentence_segs == ['Dansk er et østnordisk sprog indenfor den germanske gren af den indoeuropæiske sprogfamilie.', 'Det danske sprog tales af ca.', 'seks millioner mennesker,', 'hovedsageligt i Danmark,', 'men også i Sydslesvig,', 'på Færøerne og Grønland.', '[1] Dansk er tæt beslægtet med norsk,', 'svensk og islandsk,', 'og sproghistorisk har dansk været stærkt påvirket af plattysk.'] - elif lang == 'nld': - assert sentence_segs == ['Het Nederlands is een West-Germaanse taal,', 'de meest gebruikte taal in Nederland en België,', 'de officiële taal van Suriname en een van de drie officiële talen van België.', 'Binnen het Koninkrijk der Nederlanden is het Nederlands ook een officiële taal van Aruba,', 'Curaçao en Sint-Maarten.', 'Het Nederlands is na Engels en Duits de meest gesproken Germaanse taal.'] - elif lang.startswith('eng_') or lang == 'other': - assert sentence_segs == ['English is a West Germanic language in the Indo-European language family.', 'Originating in early medieval England,', '[3][4][5] today English is both the most spoken language in the world[6] and the third most spoken native language,', 'after Mandarin Chinese and Spanish.', '[7] English is the most widely learned second language and is either the official language or one of the 
official languages in 59 sovereign states.', 'There are more people who have learned English as a second language than there are native speakers.', 'As of 2005,', 'it was estimated that there were over two billion speakers of English.', '[8]'] - elif lang == 'est': - assert sentence_segs == ['Eesti keelel on kaks suuremat murderühma (põhjaeesti ja lõunaeesti),', 'mõnes käsitluses eristatakse ka kirderanniku murdeid eraldi murderühmana.', 'Liikumisvõimaluste laienemine ning põhjaeesti keskmurde alusel loodud normitud eesti kirjakeele kasutus on põhjustanud murdeerinevuste taandumise.'] - elif lang == 'fin': - assert sentence_segs == ['Suomen kieli eli suomi on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli,', 'jota puhuvat pääosin suomalaiset.', 'Suomessa suomen kieltä puhuu äidinkielenään 4,', '8 miljoonaa ja toisena kielenään 0,', '5 miljoonaa ihmistä.', 'Suurimmat suomea puhuvat vähemmistöt ovat Ruotsissa,', 'Norjassa ja Venäjällä.'] - elif lang == 'fra': - assert sentence_segs == ['Le français est une langue indo-européenne de la famille des langues romanes dont les locuteurs sont appelés francophones.', 'Elle est parfois surnommée la langue de Molière.'] - elif lang.startswith('deu_'): - assert sentence_segs == ['Das Deutsche ist eine plurizentrische Sprache,', 'enthält also mehrere Standardvarietäten in verschiedenen Regionen.', 'Ihr Sprachgebiet umfasst Deutschland,', 'Österreich,', 'die Deutschschweiz,', 'Liechtenstein,', 'Luxemburg,', 'Ostbelgien,', 'Südtirol,', 'das Elsass und Lothringen sowie Nordschleswig.', 'Außerdem ist Deutsch eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern,', 'z.', 'B.', 'in Rumänien und Südafrika sowie Nationalsprache im afrikanischen Namibia.', 'Deutsch ist die meistgesprochene Muttersprache in der Europäischen Union (EU).', '[26]'] - elif lang == 'ell': - assert sentence_segs == ['Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και αποτελεί το μοναδικό μέλος του ελληνικού κλάδου,', 'ενώ είναι η επίσημη γλώσσα της Ελλάδας και της Κύπρου.', 'Ανήκει επίσης στο βαλκανικό γλωσσικό δεσμό.', 'Στην ελληνική γλώσσα,', 'έχουμε γραπτά κείμενα ήδη από τον 15ο αιώνα π.', 'Χ..', 'Σαν Παγκόσμια Ημέρα Ελληνικής Γλώσσας,', 'κάθε έτος,', 'έχει καθιερωθεί η 9η Φεβρουαρίου.', 'Έχει την μακροβιότερη καταγεγραμμένη ιστορία από οποιαδήποτε άλλη ζωντανή ινδοευρωπαϊκή γλώσσα με τουλάχιστον 3.', '400 χρόνια γραπτής ιστορίας.', '[10] Γράφεται με το ελληνικό αλφάβητο,', 'το οποίο χρησιμοποιείται αδιάκοπα (αρχικά με τοπικές παραλλαγές,', 'μετέπειτα υπό μια,', 'ενιαία μορφή) εδώ και περίπου 2.', '600 χρόνια.', '[11][12] Προηγουμένως η ελληνική γλώσσα γραφόταν με τη Γραμμική Β και το κυπριακό συλλαβάριο.', '[13] Το ελληνικό αλφάβητο προέρχεται από το φοινικικό αλφάβητο,', 'με κάποιες προσαρμογές.', 'Στο ελληνικό αλφάβητο βασίζεται το λατινικό,', 'το κυριλλικό,', 'το αρμενικό,', 'το κοπτικό,', 'το γοτθικό και πολλά άλλα αλφάβητα.'] - elif lang == 'ita': - assert sentence_segs == ["L'italiano ([itaˈljaːno][Nota 1] ascoltaⓘ) è una lingua romanza parlata principalmente in Italia.", 'Per ragioni storiche e geografiche,', "l'italiano è la lingua romanza meno divergente dal latino.", '[2][3][4][Nota 2]'] - elif lang == 'khm': - assert sentence_segs == ['ភាសាខ្មែរ គឺជាភាសាកំណើតរបស់ជនជាតិខ្មែរនិងជាភាសាផ្លូវការរបស់ប្រទេសកម្ពុជា។', 'ភាសាសំស្ក្រឹតនិងភាសាបាលីបាន\u200bជួយបង្កើតខេមរភាសា ព្រោះភាសាខ្មែរបានខ្ចីពាក្យច្រើនពីភាសាទាំងពីរនេះ។', '\u200bមានអក្សរក្រមវែងជាងគេនៅលើពិភពលោក ។', '\u200b វាជាភាសាមួយដ៏ចំណាស់\u200b 
ដែលប្រហែលជាមានដើមកំណើតតាំងតែពី\u200b២០០០ឆ្នាំមុនមកម៉្លេះ។'] - elif lang == 'lao': - assert sentence_segs == ['ພາສາລາວ (Lao:', 'ລາວ,', '[láːw] ຫຼື ພາສາລາວ,', '[pʰáːsǎːláːw]) ເປັນພາສາຕະກູນໄທ-ກະໄດຂອງຄົນລາວ ໂດຍມີຄົນເວົ້າໃນປະເທດລາວ ເຊິ່ງເປັນພາສາລັດຖະການຂອງສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ ຂອງປະຊາກອນປະມານ 7 ລ້ານຄົນ ແລະໃນພື້ນທີ່ພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດໄທທີ່ມີຄົນເວົ້າປະມານ 23 ລ້ານຄົນ ທາງລັດຖະບານປະເທດໄທມີການສະໜັບສະໜຸນໃຫ້ເອີ້ນພາສາລາວຖິ່ນໄທວ່າ ພາສາລາວຖິ່ນອີສານ ນອກຈາກນີ້,', 'ຢູ່ທາງພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດກຳປູເຈຍກໍມີຄົນເວົ້າພາສາລາວຄືກັນ.', 'ພາສາລາວເປັນແມ່ຂອງຄົນເຊື້ອຊາດລາວທັງຢູ່ພາຍໃນແລະຕ່າງປະເທດ ທັງເປັນພາສາກາງຂອງພົນລະເມືອງໃນປະເທດລາວທີ່ມີພາສາອື່ນອີກຫຼາຍພາສາ ເຊິ່ງບາງພາສາບໍ່ມີຄວາມກ່ຽວຂ້ອງກັບພາສານີ້[3] .'] - elif lang == 'mal': - assert sentence_segs == ['ഇന്ത്യയിൽ കേരള സംസ്ഥാനത്തിലും കേന്ദ്രഭരണപ്രദേശങ്ങളായ ലക്ഷദ്വീപിലും പോണ്ടിച്ചേരിയുടെ ഭാഗമായ മാഹിയിലും തമിഴ്നാട്ടിലെ കന്യാകുമാരി ജില്ലയിലും നീലഗിരി ജില്ലയിലെ ഗൂഡല്ലൂർ താലൂക്കിലും സംസാരിക്കപ്പെടുന്ന ഭാഷയാണ് മലയാളം.', 'ഇതു ദ്രാവിഡ ഭാഷാ കുടുംബത്തിൽപ്പെടുന്നു.', 'ഇന്ത്യയിൽ ശ്രേഷ്ഠഭാഷാ പദവി ലഭിക്കുന്ന അഞ്ചാമത്തെ ഭാഷയാണ് മലയാളം[5].', '2013 മെയ് 23-നു ചേർന്ന കേന്ദ്രമന്ത്രിസഭായോഗമാണ് മലയാളത്തെ ശ്രേഷ്ഠഭാഷയായി അംഗീകരിച്ചത്.', 'ക്ലാസിക്കൽ ലാംഗ്വേജ് എന്ന പദവിയാണ് ലൽകിയത്.', 'അതിനു മലയാളത്തിൽ നൽകിയ വിവർത്തനം ആണ് ശ്രേഷ്ഠഭാഷ എന്നത്.', 'വാസ്തവത്തിൽ ഇത് അത്രശരിയായ വിവർത്തനമോ ശരിയായ പ്രയോഗമോ അല്ല.', 'ശ്രേഷ്ഠം മോശം എന്ന നിലയിൽ ഭാഷകളെ വിലയിരുത്തുന്നത് ശാസ്ത്രീയമായ കാര്യമല്ല.', 'ഭാഷകളിൽ ശ്രേഷ്ഠമെന്നും അല്ലാത്തത് എന്നുമുള്ള വിഭജനം ഇല്ല.', 'ഇന്ത്യൻ ഭരണഘടനയിലെ എട്ടാം ഷെഡ്യൂളിൽ ഉൾപ്പെടുത്തിയിരിക്കുന്ന ഇന്ത്യയിലെ ഇരുപത്തിരണ്ട് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നാണ് മലയാളം[6].', 'മലയാള ഭാഷ കൈരളി,', 'മലനാട് ഭാഷ എന്നും അറിയപ്പെടുന്നു.', 'കേരള സംസ്ഥാനത്തിലെ ഭരണഭാഷയും കൂടിയാണ്\u200c മലയാളം.', 'കേരളത്തിനും ലക്ഷദ്വീപിനും പുറമേ തമിഴ്നാട്ടിലെ ചില ഭാഗങ്ങളിലും കന്യാകുമാരി ജില്ല,', 'നീലഗിരി ജില്ല കർണാടകയുടെ ദക്ഷിണ കന്നഡ ജില്ല,', 'കൊടഗ് ഭാഗങ്ങളിലും ഗൾഫ് രാജ്യങ്ങൾ,', 'സിംഗപ്പൂർ,', 'മലേഷ്യ എന്നിവിടങ്ങളിലെ കേരളീയ പൈതൃകമുള്ള അനേകം ജനങ്ങളും മലയാളം ഉപയോഗിച്ചുപോരുന്നു.', 'ദേശീയ ഭാഷയായി ഉൾപ്പെടുത്തിയത് മറ്റ് 21 ഭാഷകളുടേതുപോലെ തനതായ വ്യക്തിത്വം ഉള്ളതിനാലാണ്.', 'മലയാള ഭാഷയുടെ ഉല്പത്തിയും പ്രാചീനതയും സംബന്ധിച്ച കാര്യങ്ങൾ ഇന്നും അവ്യക്തമാണ്.', 'പഴയ തമിഴിനും മുൻപത്തെ മൂലദ്രാവിഡമാണ് മലയാളത്തിന്റെ ആദ്യ രൂപം എന്നു കരുതുന്നു.', 'യു.', 'എ.', 'ഇ-യിലെ നാല് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നു മലയാളമാണ്.', '[അവലംബം ആവശ്യമാണ്]'] - elif lang == 'nob': - assert sentence_segs == ['Bokmål er en av to offisielle målformer av norsk skriftspråk,', 'hvorav den andre er nynorsk.', 'I skrift har 87,', '3% bokmål som hovedmål i skolen.', '[1] Etter skriftreformene av riksmål i 1987 og bokmål i 1981 og 2005 er det lite som skiller bokmål og riksmål i alminnelig bruk.'] - elif lang == 'nno': - assert sentence_segs == ['Nynorsk,', 'før 1929 offisielt kalla landsmål,', 'er sidan jamstillingsvedtaket av 12.', 'mai 1885 ei av dei to offisielle målformene av norsk;', 'den andre forma er bokmål.', 'Nynorsk vert i dag nytta av om lag 10–15% av innbyggjarane i Noreg.', '[1][2] Skriftspråket er basert på nynorsk talemål,', 'det vil seie dei moderne norske dialektane til skilnad frå gamalnorsk og mellomnorsk.', 'Når ein seier at nokon snakkar nynorsk,', 'meiner ein helst at dei snakkar nynorsk normaltalemål.', 'Dei færraste dialekttalande nordmenn seier at dei snakkar nynorsk,', 'men det er ikkje uvanleg i kjerneområda til nynorsken.', 'Dette tilhøvet mellom tale og skrift ligg bak målrørsla sitt slagord sidan 1970-talet:', '«Snakk dialekt – skriv nynorsk!', '» Nynorske dialektar vart snakka over heile landet,', 'men det er berre på Vestlandet utanom dei største 
byene og i dei austlandske fjellbygdene at skriftspråket står sterkt.', 'Det vil seie at dei fleste dialekttalarane har bokmål som det primære skriftspråket sitt.'] - elif lang == 'pol': - assert sentence_segs == ['Język polski,', 'polszczyzna – język z grupy zachodniosłowiańskiej (do której należą również czeski,', 'kaszubski,', 'słowacki i języki łużyckie),', 'stanowiącej część rodziny indoeuropejskiej.', 'Jest językiem urzędowym w Polsce oraz należy do oficjalnych języków Unii Europejskiej.'] - elif lang.startswith('por_'): - assert sentence_segs == ['A língua portuguesa,', 'também designada português,', 'é uma língua indo-europeia românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.', 'Com a criação do Reino de Portugal em 1139 e a expansão para o sul na sequência da Reconquista,', 'deu-se a difusão da língua pelas terras conquistadas e mais tarde,', 'com as descobertas portuguesas,', 'para o Brasil,', 'África e outras partes do mundo.', '[8] O português foi usado,', 'naquela época,', 'não somente nas cidades conquistadas pelos portugueses,', 'mas também por muitos governantes locais nos seus contatos com outros estrangeiros poderosos.', 'Especialmente nessa altura a língua portuguesa também influenciou várias línguas.', '[9]'] - elif lang == 'rus': - assert sentence_segs == ['Ру́сский язы́к (МФА:', '[ˈruskʲɪi̯ jɪˈzɨk]ⓘ)[~ 3][⇨] — язык восточнославянской группы славянской ветви индоевропейской языковой семьи,', 'национальный язык русского народа.', 'Является одним из наиболее распространённых языков мира — восьмым среди всех языков мира по общей численности говорящих[5] и седьмым по численности владеющих им как родным (2022)[2].', 'Русский является также самым распространённым славянским языком[8] и самым распространённым языком в Европе — географически и по числу носителей языка как родного[6].'] - elif lang == 'slv': - assert sentence_segs == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore,', 'ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,', '5 (dva in pol) milijona govorcev po svetu,', 'od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov,', 'ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica,', 'pisava imenovana po hrvaškem jezikoslovcu Ljudevitu Gaju,', 'ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848.', 'Do takrat smo uporabljali bohoričico.'] - elif lang == 'spa': - assert sentence_segs == ['El español o castellano es una lengua romance procedente del latín hablado,', 'perteneciente a la familia de lenguas indoeuropeas.', 'Forma parte del grupo ibérico y es originaria de Castilla,', 'reino medieval de la península ibérica.', 'Se conoce también informalmente como castillan.', '1\u200b33\u200b34\u200b en algunas áreas rurales e indígenas de América,', '35\u200b pues el español se empezó a enseñar poco después de la incorporación de los nuevos territorios a la Corona de Castilla.', '36\u200b37\u200b38\u200b39\u200b40\u200b41\u200b'] - elif lang == 'swe': - assert sentence_segs == ['Svenska (svenska\u2009(info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk,', 'men även som det ena nationalspråket i Finland och 
som enda officiella språk på Åland.', 'I övriga Finland talas det som modersmål framförallt i de finlandssvenska kustområdena i Österbotten,', 'Åboland och Nyland.', 'En liten minoritet svenskspråkiga finns även i Estland.', 'Svenska är nära besläktat och i hög grad ömsesidigt begripligt med danska och norska.', 'De andra nordiska språken,', 'isländska och färöiska,', 'är mindre ömsesidigt begripliga med svenska.', 'Liksom de övriga nordiska språken härstammar svenskan från en gren av fornnordiska,', 'vilket var det språk som talades av de germanska folken i Skandinavien.'] - elif lang == 'tha': - assert sentence_segs == ['ภาษาไทย หรือ ภาษาไทยกลาง เป็นภาษาในกลุ่มภาษาไท ซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า-ไท และเป็นภาษาราชการ และภาษาประจำชาติของประเทศไทย[3][4] มีการสันนิษฐานว่าภาษาในตระกูลนี้มีถิ่นกำเนิดจากทางตอนใต้ของประเทศจีน และนักภาษาศาสตร์บางส่วนเสนอว่า ภาษาไทยน่าจะมีความเชื่อมโยงกับตระกูลภาษาออสโตร-เอเชียติก ตระกูลภาษาออสโตรนีเซียน และตระกูลภาษาจีน-ทิเบต'] - elif lang == 'bod': - assert sentence_segs == ['བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་ཉེ་འཁོར་གྱི་ས་ཁུལ་བལ་ཡུལ།', 'འབྲུག་དང་འབྲས་ལྗོངས།', 'ལ་དྭགས་ནས་ལྷོ་མོན་རོང་སོགས་སུ་བེད་སྤྱོད་བྱེད་པའི་སྐད་ཡིག་དེ།', 'ད་ཆར་ཡོངས་གྲགས་སུ་བོད་ཀྱི་ཡུལ་གྲུ་སྟོད་སྨད་བར་གསུམ་ལ་ལྟོས་ཏེ་ནང་གསེས་རིགས་གསུམ་དུ་ཕྱེ་བ་སྟེ།', 'སྟོད་དབུས་གཙང་གི་སྐད་དང་།', 'བར་ཁམས་པའི་སྐད་དང་།', 'སྨད་ཨ་མདོའི་སྐད་རྣམས་སོ།', 'བོད་སྐད་ནི་ཧོར་སོག་ལ་སོགས་པ་གྲངས་ཉུང་མི་རིགས་གཞན་པ་ཁག་ཅིག་གིས་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད་པར་མ་ཟད།', 'བལ་ཡུལ་དང་།', 'འབྲས་ལྗོངས།', 'འབྲུག་ཡུལ་།', 'རྒྱ་གར་ཤར་དང་བྱང་རྒྱུད་མངའ་སྡེ་ཁག་གཅིག་བཅས་ཀྱི་རྒྱལ་ཁབ་རྣམས་སུའང་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད།'] - elif lang == 'tur': - assert sentence_segs == ['Türkçe ya da Türk dili,', "Güneydoğu Avrupa ve Batı Asya'da konuşulan,", 'Türk dilleri dil ailesine ait sondan eklemeli bir dil.', '[12] Türk dilleri ailesinin Oğuz dilleri grubundan bir Batı Oğuz dili olan Osmanlı Türkçesinin devamını oluşturur.', 'Dil,', 'başta Türkiye olmak üzere Balkanlar,', 'Ege Adaları,', "Kıbrıs ve Orta Doğu'yu kapsayan eski Osmanlı İmparatorluğu coğrafyasında konuşulur.", "[12] Ethnologue'a göre Türkçe,", 'yaklaşık 83 milyon konuşuru ile dünyada en çok konuşulan 16.', 'dildir.', '[13] Türkçe Türkiye,', "Kıbrıs Cumhuriyeti ve Kuzey Kıbrıs'ta ulusal resmî dil statüsüne sahiptir.", '[12]'] - elif lang == 'vie': - assert sentence_segs == ['Tiếng Việt,', 'cũng gọi là tiếng Việt Nam[9] hay Việt ngữ là ngôn ngữ của người Việt và là ngôn ngữ chính thức tại Việt Nam.', 'Đây là tiếng mẹ đẻ của khoảng 85% dân cư Việt Nam cùng với hơn 4 triệu người Việt kiều.', 'Tiếng Việt còn là ngôn ngữ thứ hai của các dân tộc thiểu số tại Việt Nam và là ngôn ngữ dân tộc thiểu số được công nhận tại Cộng hòa Séc.'] - else: - raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + match lang: + case 'ces': + assert sentence_segs == ['Čeština neboli český jazyk je západoslovanský jazyk,', 'nejbližší slovenštině,', 'poté lužické srbštině a polštině.', 'Patří mezi slovanské jazyky,', 'do rodiny jazyků indoevropských.', 'Čeština se vyvinula ze západních nářečí praslovanštiny na konci 10.', 'století.', 'Je částečně ovlivněná latinou a němčinou.', 'Česky psaná literatura se objevuje od 14.', 'století.', 'První písemné památky jsou však již z 12.', 'století.'] + case 'dan': + assert sentence_segs == ['Dansk er et østnordisk sprog indenfor den germanske gren af den indoeuropæiske sprogfamilie.', 'Det danske sprog tales af ca.', 'seks millioner mennesker,', 'hovedsageligt i Danmark,', 'men også i Sydslesvig,', 'på Færøerne og Grønland.', '[1] Dansk er tæt beslægtet med norsk,', 'svensk og 
islandsk,', 'og sproghistorisk har dansk været stærkt påvirket af plattysk.'] + case 'nld': + assert sentence_segs == ['Het Nederlands is een West-Germaanse taal,', 'de meest gebruikte taal in Nederland en België,', 'de officiële taal van Suriname en een van de drie officiële talen van België.', 'Binnen het Koninkrijk der Nederlanden is het Nederlands ook een officiële taal van Aruba,', 'Curaçao en Sint-Maarten.', 'Het Nederlands is na Engels en Duits de meest gesproken Germaanse taal.'] + case 'eng_gb' | 'eng_us' | 'other': + assert sentence_segs == ['English is a West Germanic language in the Indo-European language family.', 'Originating in early medieval England,', '[3][4][5] today English is both the most spoken language in the world[6] and the third most spoken native language,', 'after Mandarin Chinese and Spanish.', '[7] English is the most widely learned second language and is either the official language or one of the official languages in 59 sovereign states.', 'There are more people who have learned English as a second language than there are native speakers.', 'As of 2005,', 'it was estimated that there were over two billion speakers of English.', '[8]'] + case 'est': + assert sentence_segs == ['Eesti keelel on kaks suuremat murderühma (põhjaeesti ja lõunaeesti),', 'mõnes käsitluses eristatakse ka kirderanniku murdeid eraldi murderühmana.', 'Liikumisvõimaluste laienemine ning põhjaeesti keskmurde alusel loodud normitud eesti kirjakeele kasutus on põhjustanud murdeerinevuste taandumise.'] + case 'fin': + assert sentence_segs == ['Suomen kieli eli suomi on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli,', 'jota puhuvat pääosin suomalaiset.', 'Suomessa suomen kieltä puhuu äidinkielenään 4,', '8 miljoonaa ja toisena kielenään 0,', '5 miljoonaa ihmistä.', 'Suurimmat suomea puhuvat vähemmistöt ovat Ruotsissa,', 'Norjassa ja Venäjällä.'] + case 'fra': + assert sentence_segs == ['Le français est une langue indo-européenne de la famille des langues romanes dont les locuteurs sont appelés francophones.', 'Elle est parfois surnommée la langue de Molière.'] + case 'deu_at' | 'deu_de' | 'deu_ch': + assert sentence_segs == ['Das Deutsche ist eine plurizentrische Sprache,', 'enthält also mehrere Standardvarietäten in verschiedenen Regionen.', 'Ihr Sprachgebiet umfasst Deutschland,', 'Österreich,', 'die Deutschschweiz,', 'Liechtenstein,', 'Luxemburg,', 'Ostbelgien,', 'Südtirol,', 'das Elsass und Lothringen sowie Nordschleswig.', 'Außerdem ist Deutsch eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern,', 'z.', 'B.', 'in Rumänien und Südafrika sowie Nationalsprache im afrikanischen Namibia.', 'Deutsch ist die meistgesprochene Muttersprache in der Europäischen Union (EU).', '[26]'] + case 'ell': + assert sentence_segs == ['Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και αποτελεί το μοναδικό μέλος του ελληνικού κλάδου,', 'ενώ είναι η επίσημη γλώσσα της Ελλάδας και της Κύπρου.', 'Ανήκει επίσης στο βαλκανικό γλωσσικό δεσμό.', 'Στην ελληνική γλώσσα,', 'έχουμε γραπτά κείμενα ήδη από τον 15ο αιώνα π.', 'Χ..', 'Σαν Παγκόσμια Ημέρα Ελληνικής Γλώσσας,', 'κάθε έτος,', 'έχει καθιερωθεί η 9η Φεβρουαρίου.', 'Έχει την μακροβιότερη καταγεγραμμένη ιστορία από οποιαδήποτε άλλη ζωντανή ινδοευρωπαϊκή γλώσσα με τουλάχιστον 3.', '400 χρόνια γραπτής ιστορίας.', '[10] Γράφεται με το ελληνικό αλφάβητο,', 'το οποίο χρησιμοποιείται αδιάκοπα (αρχικά με τοπικές παραλλαγές,', 'μετέπειτα υπό μια,', 'ενιαία μορφή) εδώ και περίπου 2.', '600 χρόνια.', '[11][12] 
Προηγουμένως η ελληνική γλώσσα γραφόταν με τη Γραμμική Β και το κυπριακό συλλαβάριο.', '[13] Το ελληνικό αλφάβητο προέρχεται από το φοινικικό αλφάβητο,', 'με κάποιες προσαρμογές.', 'Στο ελληνικό αλφάβητο βασίζεται το λατινικό,', 'το κυριλλικό,', 'το αρμενικό,', 'το κοπτικό,', 'το γοτθικό και πολλά άλλα αλφάβητα.'] + case 'ita': + assert sentence_segs == ["L'italiano ([itaˈljaːno][Nota 1] ascoltaⓘ) è una lingua romanza parlata principalmente in Italia.", 'Per ragioni storiche e geografiche,', "l'italiano è la lingua romanza meno divergente dal latino.", '[2][3][4][Nota 2]'] + case 'khm': + assert sentence_segs == ['ភាសាខ្មែរ គឺជាភាសាកំណើតរបស់ជនជាតិខ្មែរនិងជាភាសាផ្លូវការរបស់ប្រទេសកម្ពុជា។', 'ភាសាសំស្ក្រឹតនិងភាសាបាលីបាន\u200bជួយបង្កើតខេមរភាសា ព្រោះភាសាខ្មែរបានខ្ចីពាក្យច្រើនពីភាសាទាំងពីរនេះ។', '\u200bមានអក្សរក្រមវែងជាងគេនៅលើពិភពលោក ។', '\u200b វាជាភាសាមួយដ៏ចំណាស់\u200b ដែលប្រហែលជាមានដើមកំណើតតាំងតែពី\u200b២០០០ឆ្នាំមុនមកម៉្លេះ។'] + case 'lao': + assert sentence_segs == ['ພາສາລາວ (Lao:', 'ລາວ,', '[láːw] ຫຼື ພາສາລາວ,', '[pʰáːsǎːláːw]) ເປັນພາສາຕະກູນໄທ-ກະໄດຂອງຄົນລາວ ໂດຍມີຄົນເວົ້າໃນປະເທດລາວ ເຊິ່ງເປັນພາສາລັດຖະການຂອງສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ ຂອງປະຊາກອນປະມານ 7 ລ້ານຄົນ ແລະໃນພື້ນທີ່ພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດໄທທີ່ມີຄົນເວົ້າປະມານ 23 ລ້ານຄົນ ທາງລັດຖະບານປະເທດໄທມີການສະໜັບສະໜຸນໃຫ້ເອີ້ນພາສາລາວຖິ່ນໄທວ່າ ພາສາລາວຖິ່ນອີສານ ນອກຈາກນີ້,', 'ຢູ່ທາງພາກຕາເວັນອອກສຽງເໜືອຂອງປະເທດກຳປູເຈຍກໍມີຄົນເວົ້າພາສາລາວຄືກັນ.', 'ພາສາລາວເປັນແມ່ຂອງຄົນເຊື້ອຊາດລາວທັງຢູ່ພາຍໃນແລະຕ່າງປະເທດ ທັງເປັນພາສາກາງຂອງພົນລະເມືອງໃນປະເທດລາວທີ່ມີພາສາອື່ນອີກຫຼາຍພາສາ ເຊິ່ງບາງພາສາບໍ່ມີຄວາມກ່ຽວຂ້ອງກັບພາສານີ້[3] .'] + case 'mal': + assert sentence_segs == ['ഇന്ത്യയിൽ കേരള സംസ്ഥാനത്തിലും കേന്ദ്രഭരണപ്രദേശങ്ങളായ ലക്ഷദ്വീപിലും പോണ്ടിച്ചേരിയുടെ ഭാഗമായ മാഹിയിലും തമിഴ്നാട്ടിലെ കന്യാകുമാരി ജില്ലയിലും നീലഗിരി ജില്ലയിലെ ഗൂഡല്ലൂർ താലൂക്കിലും സംസാരിക്കപ്പെടുന്ന ഭാഷയാണ് മലയാളം.', 'ഇതു ദ്രാവിഡ ഭാഷാ കുടുംബത്തിൽപ്പെടുന്നു.', 'ഇന്ത്യയിൽ ശ്രേഷ്ഠഭാഷാ പദവി ലഭിക്കുന്ന അഞ്ചാമത്തെ ഭാഷയാണ് മലയാളം[5].', '2013 മെയ് 23-നു ചേർന്ന കേന്ദ്രമന്ത്രിസഭായോഗമാണ് മലയാളത്തെ ശ്രേഷ്ഠഭാഷയായി അംഗീകരിച്ചത്.', 'ക്ലാസിക്കൽ ലാംഗ്വേജ് എന്ന പദവിയാണ് ലൽകിയത്.', 'അതിനു മലയാളത്തിൽ നൽകിയ വിവർത്തനം ആണ് ശ്രേഷ്ഠഭാഷ എന്നത്.', 'വാസ്തവത്തിൽ ഇത് അത്രശരിയായ വിവർത്തനമോ ശരിയായ പ്രയോഗമോ അല്ല.', 'ശ്രേഷ്ഠം മോശം എന്ന നിലയിൽ ഭാഷകളെ വിലയിരുത്തുന്നത് ശാസ്ത്രീയമായ കാര്യമല്ല.', 'ഭാഷകളിൽ ശ്രേഷ്ഠമെന്നും അല്ലാത്തത് എന്നുമുള്ള വിഭജനം ഇല്ല.', 'ഇന്ത്യൻ ഭരണഘടനയിലെ എട്ടാം ഷെഡ്യൂളിൽ ഉൾപ്പെടുത്തിയിരിക്കുന്ന ഇന്ത്യയിലെ ഇരുപത്തിരണ്ട് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നാണ് മലയാളം[6].', 'മലയാള ഭാഷ കൈരളി,', 'മലനാട് ഭാഷ എന്നും അറിയപ്പെടുന്നു.', 'കേരള സംസ്ഥാനത്തിലെ ഭരണഭാഷയും കൂടിയാണ്\u200c മലയാളം.', 'കേരളത്തിനും ലക്ഷദ്വീപിനും പുറമേ തമിഴ്നാട്ടിലെ ചില ഭാഗങ്ങളിലും കന്യാകുമാരി ജില്ല,', 'നീലഗിരി ജില്ല കർണാടകയുടെ ദക്ഷിണ കന്നഡ ജില്ല,', 'കൊടഗ് ഭാഗങ്ങളിലും ഗൾഫ് രാജ്യങ്ങൾ,', 'സിംഗപ്പൂർ,', 'മലേഷ്യ എന്നിവിടങ്ങളിലെ കേരളീയ പൈതൃകമുള്ള അനേകം ജനങ്ങളും മലയാളം ഉപയോഗിച്ചുപോരുന്നു.', 'ദേശീയ ഭാഷയായി ഉൾപ്പെടുത്തിയത് മറ്റ് 21 ഭാഷകളുടേതുപോലെ തനതായ വ്യക്തിത്വം ഉള്ളതിനാലാണ്.', 'മലയാള ഭാഷയുടെ ഉല്പത്തിയും പ്രാചീനതയും സംബന്ധിച്ച കാര്യങ്ങൾ ഇന്നും അവ്യക്തമാണ്.', 'പഴയ തമിഴിനും മുൻപത്തെ മൂലദ്രാവിഡമാണ് മലയാളത്തിന്റെ ആദ്യ രൂപം എന്നു കരുതുന്നു.', 'യു.', 'എ.', 'ഇ-യിലെ നാല് ഔദ്യോഗിക ഭാഷകളിൽ ഒന്നു മലയാളമാണ്.', '[അവലംബം ആവശ്യമാണ്]'] + case 'nob': + assert sentence_segs == ['Bokmål er en av to offisielle målformer av norsk skriftspråk,', 'hvorav den andre er nynorsk.', 'I skrift har 87,', '3% bokmål som hovedmål i skolen.', '[1] Etter skriftreformene av riksmål i 1987 og bokmål i 1981 og 2005 er det lite som skiller bokmål og riksmål i alminnelig bruk.'] + case 'nno': + assert sentence_segs == 
['Nynorsk,', 'før 1929 offisielt kalla landsmål,', 'er sidan jamstillingsvedtaket av 12.', 'mai 1885 ei av dei to offisielle målformene av norsk;', 'den andre forma er bokmål.', 'Nynorsk vert i dag nytta av om lag 10–15% av innbyggjarane i Noreg.', '[1][2] Skriftspråket er basert på nynorsk talemål,', 'det vil seie dei moderne norske dialektane til skilnad frå gamalnorsk og mellomnorsk.', 'Når ein seier at nokon snakkar nynorsk,', 'meiner ein helst at dei snakkar nynorsk normaltalemål.', 'Dei færraste dialekttalande nordmenn seier at dei snakkar nynorsk,', 'men det er ikkje uvanleg i kjerneområda til nynorsken.', 'Dette tilhøvet mellom tale og skrift ligg bak målrørsla sitt slagord sidan 1970-talet:', '«Snakk dialekt – skriv nynorsk!', '» Nynorske dialektar vart snakka over heile landet,', 'men det er berre på Vestlandet utanom dei største byene og i dei austlandske fjellbygdene at skriftspråket står sterkt.', 'Det vil seie at dei fleste dialekttalarane har bokmål som det primære skriftspråket sitt.'] + case 'pol': + assert sentence_segs == ['Język polski,', 'polszczyzna – język z grupy zachodniosłowiańskiej (do której należą również czeski,', 'kaszubski,', 'słowacki i języki łużyckie),', 'stanowiącej część rodziny indoeuropejskiej.', 'Jest językiem urzędowym w Polsce oraz należy do oficjalnych języków Unii Europejskiej.'] + case 'por_br' | 'por_pt': + assert sentence_segs == ['A língua portuguesa,', 'também designada português,', 'é uma língua indo-europeia românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.', 'Com a criação do Reino de Portugal em 1139 e a expansão para o sul na sequência da Reconquista,', 'deu-se a difusão da língua pelas terras conquistadas e mais tarde,', 'com as descobertas portuguesas,', 'para o Brasil,', 'África e outras partes do mundo.', '[8] O português foi usado,', 'naquela época,', 'não somente nas cidades conquistadas pelos portugueses,', 'mas também por muitos governantes locais nos seus contatos com outros estrangeiros poderosos.', 'Especialmente nessa altura a língua portuguesa também influenciou várias línguas.', '[9]'] + case 'rus': + assert sentence_segs == ['Ру́сский язы́к (МФА:', '[ˈruskʲɪi̯ jɪˈzɨk]ⓘ)[~ 3][⇨] — язык восточнославянской группы славянской ветви индоевропейской языковой семьи,', 'национальный язык русского народа.', 'Является одним из наиболее распространённых языков мира — восьмым среди всех языков мира по общей численности говорящих[5] и седьмым по численности владеющих им как родным (2022)[2].', 'Русский является также самым распространённым славянским языком[8] и самым распространённым языком в Европе — географически и по числу носителей языка как родного[6].'] + case 'slv': + assert sentence_segs == ['Slovenščina [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore,', 'ki jih govorijo ali so jih nekoč govorili Slovenci.', 'Govori ga okoli 2,', '5 (dva in pol) milijona govorcev po svetu,', 'od katerih jih večina živi v Sloveniji.', 'Glede na število govorcev ima razmeroma veliko narečij.', 'Slovenščina je zahodni južnoslovanski jezik in eden redkih indoevropskih jezikov,', 'ki je ohranil dvojino.', 'Za zapisovanje slovenskega jezika se danes uporablja gajica,', 'pisava imenovana po hrvaškem jezikoslovcu Ljudevitu Gaju,', 'ki jo je priredil po češkem črkopisu.', 'Slovenska gajica se imenuje slovenica.', 'Pišemo jo od marčne revolucije 1848.', 'Do takrat smo uporabljali bohoričico.'] + case 'spa': + assert sentence_segs == ['El 
español o castellano es una lengua romance procedente del latín hablado,', 'perteneciente a la familia de lenguas indoeuropeas.', 'Forma parte del grupo ibérico y es originaria de Castilla,', 'reino medieval de la península ibérica.', 'Se conoce también informalmente como castillan.', '1\u200b33\u200b34\u200b en algunas áreas rurales e indígenas de América,', '35\u200b pues el español se empezó a enseñar poco después de la incorporación de los nuevos territorios a la Corona de Castilla.', '36\u200b37\u200b38\u200b39\u200b40\u200b41\u200b'] + case 'swe': + assert sentence_segs == ['Svenska (svenska\u2009(info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk,', 'men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.', 'I övriga Finland talas det som modersmål framförallt i de finlandssvenska kustområdena i Österbotten,', 'Åboland och Nyland.', 'En liten minoritet svenskspråkiga finns även i Estland.', 'Svenska är nära besläktat och i hög grad ömsesidigt begripligt med danska och norska.', 'De andra nordiska språken,', 'isländska och färöiska,', 'är mindre ömsesidigt begripliga med svenska.', 'Liksom de övriga nordiska språken härstammar svenskan från en gren av fornnordiska,', 'vilket var det språk som talades av de germanska folken i Skandinavien.'] + case 'tha': + assert sentence_segs == ['ภาษาไทย หรือ ภาษาไทยกลาง เป็นภาษาในกลุ่มภาษาไท ซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า-ไท และเป็นภาษาราชการ และภาษาประจำชาติของประเทศไทย[3][4] มีการสันนิษฐานว่าภาษาในตระกูลนี้มีถิ่นกำเนิดจากทางตอนใต้ของประเทศจีน และนักภาษาศาสตร์บางส่วนเสนอว่า ภาษาไทยน่าจะมีความเชื่อมโยงกับตระกูลภาษาออสโตร-เอเชียติก ตระกูลภาษาออสโตรนีเซียน และตระกูลภาษาจีน-ทิเบต'] + case 'bod': + assert sentence_segs == ['བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་ཉེ་འཁོར་གྱི་ས་ཁུལ་བལ་ཡུལ།', 'འབྲུག་དང་འབྲས་ལྗོངས།', 'ལ་དྭགས་ནས་ལྷོ་མོན་རོང་སོགས་སུ་བེད་སྤྱོད་བྱེད་པའི་སྐད་ཡིག་དེ།', 'ད་ཆར་ཡོངས་གྲགས་སུ་བོད་ཀྱི་ཡུལ་གྲུ་སྟོད་སྨད་བར་གསུམ་ལ་ལྟོས་ཏེ་ནང་གསེས་རིགས་གསུམ་དུ་ཕྱེ་བ་སྟེ།', 'སྟོད་དབུས་གཙང་གི་སྐད་དང་།', 'བར་ཁམས་པའི་སྐད་དང་།', 'སྨད་ཨ་མདོའི་སྐད་རྣམས་སོ།', 'བོད་སྐད་ནི་ཧོར་སོག་ལ་སོགས་པ་གྲངས་ཉུང་མི་རིགས་གཞན་པ་ཁག་ཅིག་གིས་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད་པར་མ་ཟད།', 'བལ་ཡུལ་དང་།', 'འབྲས་ལྗོངས།', 'འབྲུག་ཡུལ་།', 'རྒྱ་གར་ཤར་དང་བྱང་རྒྱུད་མངའ་སྡེ་ཁག་གཅིག་བཅས་ཀྱི་རྒྱལ་ཁབ་རྣམས་སུའང་བེད་སྤྱོད་གཏོང་བཞིན་ཡོད།'] + case 'tur': + assert sentence_segs == ['Türkçe ya da Türk dili,', "Güneydoğu Avrupa ve Batı Asya'da konuşulan,", 'Türk dilleri dil ailesine ait sondan eklemeli bir dil.', '[12] Türk dilleri ailesinin Oğuz dilleri grubundan bir Batı Oğuz dili olan Osmanlı Türkçesinin devamını oluşturur.', 'Dil,', 'başta Türkiye olmak üzere Balkanlar,', 'Ege Adaları,', "Kıbrıs ve Orta Doğu'yu kapsayan eski Osmanlı İmparatorluğu coğrafyasında konuşulur.", "[12] Ethnologue'a göre Türkçe,", 'yaklaşık 83 milyon konuşuru ile dünyada en çok konuşulan 16.', 'dildir.', '[13] Türkçe Türkiye,', "Kıbrıs Cumhuriyeti ve Kuzey Kıbrıs'ta ulusal resmî dil statüsüne sahiptir.", '[12]'] + case 'vie': + assert sentence_segs == ['Tiếng Việt,', 'cũng gọi là tiếng Việt Nam[9] hay Việt ngữ là ngôn ngữ của người Việt và là ngôn ngữ chính thức tại Việt Nam.', 'Đây là tiếng mẹ đẻ của khoảng 85% dân cư Việt Nam cùng với hơn 4 triệu người Việt kiều.', 'Tiếng Việt còn là ngôn ngữ thứ hai của các dân tộc thiểu số tại Việt Nam và là ngôn ngữ dân tộc thiểu số được công nhận tại Cộng hòa Séc.'] + case _: + raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) @pytest.mark.parametrize('lang', 
test_langs_local) def test_sentence_seg_split(lang): @@ -228,7 +231,7 @@ def test_sentence_seg_tokenize_tokens(lang): text = ''.join(getattr(wl_test_lang_examples, f'TEXT_{lang.upper()}')), lang = lang ) - sentence_segs = wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, tokens) + sentence_segs = wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, wl_texts.to_display_texts(tokens)) if lang not in ['tha']: assert len(sentence_segs) > 1 diff --git a/tests/tests_nlp/test_sentiment_analysis.py b/tests/tests_nlp/test_sentiment_analysis.py index 0e62c2ffb..9d95694aa 100644 --- a/tests/tests_nlp/test_sentiment_analysis.py +++ b/tests/tests_nlp/test_sentiment_analysis.py @@ -19,7 +19,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import wl_sentiment_analysis, wl_word_tokenization +from wordless.wl_nlp import wl_sentiment_analysis, wl_texts, wl_word_tokenization main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') @@ -37,20 +37,27 @@ def test_sentiment_analyze(lang, sentiment_analyzer): test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') + tokens = wl_word_tokenization.wl_word_tokenize_flat( + main, + text = test_sentence, + lang = lang + ) + + wl_test_sentiment_analyze_models(lang, sentiment_analyzer, test_sentence, tokens, '', check_results = False) + +def wl_test_sentiment_analyze_models(lang, sentiment_analyzer, test_sentence, tokens, results, check_results = True): # Untokenized - sentiment_scores = wl_sentiment_analysis.wl_sentiment_analyze( + sentiment_scores_untokenized = wl_sentiment_analysis.wl_sentiment_analyze( main, inputs = [test_sentence], lang = lang, sentiment_analyzer = sentiment_analyzer ) + print(f'{lang} / {sentiment_analyzer}:') + print(f'{sentiment_scores_untokenized}\n') + # Tokenized - tokens = wl_word_tokenization.wl_word_tokenize_flat( - main, - text = test_sentence, - lang = lang - ) sentiment_scores_tokenized = wl_sentiment_analysis.wl_sentiment_analyze( main, inputs = [tokens], @@ -58,28 +65,28 @@ def test_sentiment_analyze(lang, sentiment_analyzer): sentiment_analyzer = sentiment_analyzer ) - print(f'{lang} / {sentiment_analyzer}:') - print(f'{sentiment_scores}\n') + if check_results: + assert sentiment_scores_untokenized == results + assert sentiment_scores_tokenized == results # Check for empty results - assert sentiment_scores + assert sentiment_scores_untokenized assert sentiment_scores_tokenized - for sentiment_score in sentiment_scores + sentiment_scores_tokenized: + for sentiment_score in sentiment_scores_untokenized + sentiment_scores_tokenized: assert -1 <= sentiment_score <= 1 - # Tagged texts + # Tagged main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - sentiment_scores_tokenized_tagged = wl_sentiment_analysis.wl_sentiment_analyze( + sentiment_scores_tagged = wl_sentiment_analysis.wl_sentiment_analyze( main, - inputs = [[token + '_TEST' for token in tokens]], + inputs = [[wl_texts.Wl_Token(token, tag = '_TEST') for token in tokens]], lang = lang, - sentiment_analyzer = sentiment_analyzer, - tagged = True + sentiment_analyzer = sentiment_analyzer ) - assert sentiment_scores_tokenized_tagged == sentiment_scores_tokenized + assert sentiment_scores_tagged == sentiment_scores_tokenized if __name__ == '__main__': for lang, sentiment_analyzer in test_sentiment_analyzers: diff --git a/tests/tests_nlp/test_syl_tokenization.py b/tests/tests_nlp/test_syl_tokenization.py index 4c87eef90..534933dee 100644 --- 
a/tests/tests_nlp/test_syl_tokenization.py +++ b/tests/tests_nlp/test_syl_tokenization.py @@ -19,8 +19,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_checks import wl_checks_tokens -from wordless.wl_nlp import wl_syl_tokenization, wl_word_tokenization +from wordless.wl_nlp import wl_syl_tokenization, wl_texts, wl_word_tokenization main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') @@ -32,13 +31,16 @@ @pytest.mark.parametrize('lang, syl_tokenizer', test_syl_tokenizers) def test_syl_tokenize(lang, syl_tokenizer): + tests_lang_util_skipped = False + # Untokenized - syls = wl_syl_tokenization.wl_syl_tokenize( + tokens_untokenized = wl_syl_tokenization.wl_syl_tokenize( main, inputs = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), lang = lang, syl_tokenizer = syl_tokenizer ) + syls_tokens = [token.syls for token in tokens_untokenized] # Tokenized tokens = wl_word_tokenization.wl_word_tokenize_flat( @@ -46,206 +48,159 @@ def test_syl_tokenize(lang, syl_tokenizer): text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), lang = lang ) - syls_tokenized = wl_syl_tokenization.wl_syl_tokenize( + tokens_tokenized = wl_syl_tokenization.wl_syl_tokenize( main, inputs = tokens, lang = lang, syl_tokenizer = syl_tokenizer ) + syls_tokens_tokenized = [token.syls for token in tokens_tokenized] print(f'{lang} / {syl_tokenizer}:') - print(f'{syls}\n') + print(f'{syls_tokens}\n') # Check for empty syllables - assert all(all(syls_token) for syls_token in syls) - assert all(all(syls_token) for syls_token in syls_tokenized) + assert all(all(syls_token) for syls_token in syls_tokens) + assert all(all(syls_token) for syls_token in syls_tokens_tokenized) # The count of syllables should be more than the count of tokens - assert sum((len(syls_token) for syls_token in syls)) > len(tokens) - assert sum((len(syls_token) for syls_token in syls_tokenized)) > len(tokens) + assert sum((len(syls_token) for syls_token in syls_tokens)) > len(tokens) + assert sum((len(syls_token) for syls_token in syls_tokens_tokenized)) > len(tokens) # Tokenization should not be modified - assert len(syls_tokenized) == len(tokens) + assert len(syls_tokens_tokenized) == len(tokens) - # Tagged texts + # Tagged main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - syls_tokenized_tagged = wl_syl_tokenization.wl_syl_tokenize( - main, - inputs = [token + '_TEST' for token in tokens], - lang = lang, - syl_tokenizer = syl_tokenizer, - tagged = True - ) - - for syls_tokens in syls_tokenized: - syls_tokens[-1] += '_TEST' - - assert syls_tokenized_tagged == syls_tokenized - - # Long texts - syls_tokenized_long = wl_syl_tokenization.wl_syl_tokenize( + tokens_tagged = wl_syl_tokenization.wl_syl_tokenize( main, - inputs = [str(i) for i in range(101) for j in range(10)], + inputs = [wl_texts.Wl_Token(token, tag = '_TEST') for token in tokens], lang = lang, syl_tokenizer = syl_tokenizer ) + syls_tokens_tagged = [token.syls for token in tokens_tagged] - assert syls_tokenized_long == [[str(i)] for i in range(101) for j in range(10)] + assert syls_tokens_tagged == syls_tokens_tokenized - tests_lang_util_skipped = False - - if lang == 'afr': - assert syls == [['Afri', 'kaans'], ['is'], ['ti', 'po', 'lo', 'gies'], ['be', 'skou'], ["'n"], ['In', 'do', 'Eu', 'ro', 'pe', 'se'], [','], ['Wes', 'Ger', 'maan', 'se'], [','], ['Ne', 'derfran', 'kie', 'se'], ['taal'], [','], ['['], ['2'], [']'], ['wat'], ['aan'], ['die'], ['suid', 'punt'], 
['van'], ['Afri', 'ka'], ['on', 'der'], ['in', 'vloed'], ['van'], ['ver', 'skeie'], ['an', 'der'], ['ta', 'le'], ['en'], ['taal', 'groe', 'pe'], ['ont', 'staan'], ['het'], ['.']] - elif lang == 'sqi': - assert syls == [['Gju', 'ha'], ['shqi', 'pe'], ['('], ['ose'], ['thjesht'], ['shqi', 'p', 'ja'], [')'], ['ësh', 'të'], ['gju', 'hë'], ['dhe'], ['de', 'gë'], ['e'], ['ve', 'ça', 'n', 'të'], ['e'], ['fa', 'mi', 'l', 'jes'], ['in', 'do', 'e', 'v', 'ro', 'pi', 'ane'], ['që'], ['fli', 'tet'], ['nga'], ['rreth'], ['7', '10'], ['mi', 'li', 'onë'], ['nje', 'rëz'], ['në'], ['bo', 'të'], [','], ['['], ['1'], [']'], ['kry', 'esisht'], ['në'], ['Shqi', 'pë', 'ri'], [','], ['Ko', 'so', 'vë'], ['dhe'], ['Ma', 'qe', 'do', 'ni', 'në'], ['e'], ['Ve', 'ri', 'ut'], [','], ['por'], ['edhe'], ['në'], ['zo', 'na'], ['të'], ['tje', 'ra'], ['të'], ['Ev', 'ro', 'pës'], ['Ju', 'g', 'li', 'n', 'do', 're'], ['ku'], ['ka'], ['një'], ['po', 'pu', 'll', 'si'], ['shqi', 'p', 'ta', 're'], [','], ['du', 'ke'], ['pë', 'r', 'f', 'shi', 'rë'], ['Ma', 'lin'], ['e'], ['Zi'], ['dhe'], ['Lu', 'gi', 'nën'], ['e'], ['Pre', 'she', 'vës'], ['.']] - elif lang == 'bel': - assert syls == [['Бе', 'ла', 'ру́с', 'кая'], ['мо́', 'ва'], ['—'], ['на', 'цы', 'я', 'на', 'ль', 'ная'], ['мо', 'ва'], ['бе', 'ла', 'ру', 'саў'], [','], ['ува', 'хо', 'дзіць'], ['у'], ['ін', 'да', 'еў', 'ра', 'пей', 'с', 'кую'], ['моў', 'ную'], ['сям'], ["'"], ['ю'], [','], ['сла', 'вя', 'н', 'с', 'кую'], ['гру', 'пу'], [','], ['ус', 'хо', 'д', 'не', 'с', 'ла', 'вя', 'н', 'с', 'кую'], ['па', 'д', 'г', 'ру', 'пу'], ['.']] - elif lang == 'bul': - assert syls == [['Бъ', '̀л', 'гар', 'с', 'ки', 'ят'], ['ез', 'ѝк'], ['е'], ['ин', 'до', 'ев', 'ро', 'пейс', 'ки'], ['език'], ['от'], ['гру', 'па', 'та'], ['на'], ['юж', 'нос', 'ла', 'вян', 'с', 'ки', 'те'], ['ези', 'ци'], [','], ['ка', 'то'], ['об', 'ра', 'зу', 'ва'], ['не', 'го', 'ва', 'та'], ['из', 'точ', 'на'], ['под', 'г', 'ру', 'па'], ['.']] - elif lang == 'cat': - assert syls == [['El'], ['ca', 'ta', 'là'], ['('], ['de', 'no', 'mi', 'na', 'ció'], ['ofi', 'ci', 'al'], ['a'], ['Ca', 'ta', 'lu', 'nya'], [','], ['a'], ['les'], ['Illes'], ['Ba', 'le', 'ars'], [','], ['a'], ['An', 'dor', 'ra'], [','], ['a'], ['la'], ['ciu', 'tat'], ['de'], ["l'", 'Al', 'guer'], ['i'], ['tra', 'di', 'ci', 'o', 'nal'], ['a'], ['Ca', 'ta', 'lu', 'nya'], ['del'], ['Nord'], [')'], ['o'], ['va', 'len', 'cià'], ['('], ['de', 'no', 'mi', 'na', 'ció'], ['ofi', 'ci', 'al'], ['al'], ['Pa', 'ís'], ['Va', 'len', 'cià'], ['i'], ['tra', 'di', 'ci', 'o', 'nal'], ['al'], ['Car', 'xe'], [')'], ['és'], ['una'], ['llen', 'gua'], ['ro', 'mà', 'ni', 'ca'], ['par', 'la', 'da'], ['a'], ['Ca', 'ta', 'lu', 'nya'], [','], ['el'], ['Pa', 'ís'], ['Va', 'len', 'cià'], ['('], ['tret'], ["d'", 'al', 'gu', 'nes'], ['co', 'mar', 'ques'], ['i'], ['lo', 'ca', 'li', 'tats'], ['de'], ["l'", 'in', 'te', 'ri', 'or'], [')'], [','], ['les'], ['Illes'], ['Ba', 'le', 'ars'], ['('], ['on'], ['tam', 'bé'], ['rep'], ['el'], ['nom'], ['de'], ['ma', 'llor', 'quí'], [','], ['me', 'nor', 'quí'], [','], ['ei', 'vis', 'senc'], ['o'], ['for', 'men', 'te', 'rer'], ['se', 'gons'], ["l'", 'i', 'lla'], [')'], [','], ['An', 'dor', 'ra'], [','], ['la'], ['Fran', 'ja'], ['de'], ['Po', 'nent'], ['('], ['a'], ["l'", 'A', 'ra', 'gó'], [')'], [','], ['la'], ['ciu', 'tat'], ['de'], ["l'", 'Al', 'guer'], ['('], ['a'], ["l'", 'i', 'lla'], ['de'], ['Sar', 'de', 'nya'], [')'], [','], ['la'], ['Ca', 'ta', 'lu', 'nya'], ['del'], ['Nord'], [','], ['['], ['8'], [']'], ['el'], ['Car', 'xe'], ['('], ['un'], ['pe', 
'tit'], ['ter', 'ri', 'to', 'ri'], ['de'], ['Múr', 'cia'], ['ha', 'bi', 'tat'], ['per'], ['po', 'bla', 'dors'], ['va', 'len', 'ci', 'ans'], [')'], [','], ['['], ['9'], [']'], ['['], ['10'], [']'], ['i'], ['en'], ['co', 'mu', 'ni', 'tats'], ['ar', 'reu'], ['del'], ['món'], ['('], ['en', 'tre'], ['les'], ['quals'], ['des', 'ta', 'ca'], ['la'], ['de'], ["l'", 'Ar', 'gen', 'ti', 'na'], [','], ['amb'], ['200.000'], ['par', 'lants'], [')'], ['.'], ['['], ['11'], [']']] - elif lang == 'hrv': - assert syls == [['Hr', 'vat', 'ski'], ['je', 'zik'], ['('], ['ISO'], ['639', '3'], [':'], ['hrv'], [')'], ['skup', 'ni'], ['je'], ['na', 'ziv'], ['za'], ['na', 'ci', 'onal', 'ni'], ['stan', 'dard', 'ni'], ['je', 'zik'], ['Hr', 'va', 'ta'], [','], ['te'], ['za'], ['skup'], ['na', 'rje', 'čja'], ['i'], ['go', 'vo', 'ra'], ['ko', 'ji', 'ma'], ['go', 'vo', 're'], ['ili'], ['su'], ['ne', 'ka', 'da'], ['go', 'vo', 'ri', 'li'], ['Hr', 'va', 'ti'], ['.']] - elif lang == 'ces': - assert syls == [['Češ', 'ti', 'na'], ['ne', 'bo', 'li'], ['čes', 'ký'], ['ja', 'zyk'], ['je'], ['zá', 'pa', 'doslo', 'van', 'ský'], ['ja', 'zyk'], [','], ['nej', 'bliž', 'ší'], ['slo', 'ven', 'šti', 'ně'], [','], ['po', 'té'], ['lužic', 'ké'], ['srbšti', 'ně'], ['a'], ['pol', 'šti', 'ně'], ['.']] - elif lang == 'dan': - assert syls == [['Dansk'], ['er'], ['et'], ['øst', 'n', 'or', 'disk'], ['sprog'], ['in', 'den', 'for'], ['den'], ['ger', 'man', 'ske'], ['gren'], ['af'], ['den'], ['in', 'do', 'eu', 'ro', 'pæ', 'i', 'ske'], ['sprog', 'fa', 'mi', 'lie'], ['.']] - elif lang == 'nld': - assert syls == [['Het'], ['Ne', 'der', 'lands'], ['is'], ['een'], ['Wes', 't', 'Ger', 'maan', 'se'], ['taal'], [','], ['de'], ['meest'], ['ge', 'bruik', 'te'], ['taal'], ['in'], ['Ne', 'der', 'land'], ['en'], ['Bel', 'gië'], [','], ['de'], ['of', 'fi', 'ci', 'ë', 'le'], ['taal'], ['van'], ['Su', 'ri', 'na', 'me'], ['en'], ['een'], ['van'], ['de'], ['drie'], ['of', 'fi', 'ci', 'ë', 'le'], ['ta', 'len'], ['van'], ['Bel', 'gië'], ['.']] - elif lang.startswith('eng_'): - if syl_tokenizer == 'nltk_legality': - assert syls == [['En', 'glish'], ['is'], ['a'], ['West'], ['Ger', 'ma', 'nic'], ['lan', 'gu', 'a', 'ge'], ['in'], ['the'], ['In', 'do-', 'E', 'u', 'rop', 'ean'], ['lan', 'gu', 'a', 'ge'], ['fa', 'mi', 'ly'], ['.']] - elif syl_tokenizer == 'nltk_sonority_sequencing': - assert syls == [['English'], ['is'], ['a'], ['West'], ['Ger', 'ma', 'nic'], ['lan', 'gua', 'ge'], ['in'], ['the'], ['Indo', '-', 'Eu', 'ro', 'pean'], ['lan', 'gua', 'ge'], ['fa', 'mi', 'ly'], ['.']] - elif syl_tokenizer == 'pyphen_eng_gb': - assert syls == [['Eng', 'lish'], ['is'], ['a'], ['West'], ['Ger', 'man', 'ic'], ['lan', 'guage'], ['in'], ['the'], ['In', 'do', 'European'], ['lan', 'guage'], ['fam', 'ily'], ['.']] - elif syl_tokenizer == 'pyphen_eng_us': - assert syls == [['Eng', 'lish'], ['is'], ['a'], ['West'], ['Ger', 'man', 'ic'], ['lan', 'guage'], ['in'], ['the'], ['In', 'do', 'Eu', 'ro', 'pean'], ['lan', 'guage'], ['fam', 'i', 'ly'], ['.']] - else: - tests_lang_util_skipped = True - elif lang == 'epo': - assert syls == [['Es', 'pe', 'r', 'anto'], [','], ['ori', 'gi', 'ne'], ['la'], ['Lin', 'g', 'vo'], ['In', 'ter', 'na', 'cia'], [','], ['['], ['4'], [']'], ['es', 'tas'], ['la'], ['plej'], ['dis', 'vas', 't', 'iĝ', 'inta'], ['in', 'ter', 'na', 'cia'], ['plan', 'lin', 'g', 'vo.'], ['['], ['5'], [']']] - elif lang == 'est': - assert syls == [['Ees', 'ti'], ['kee', 'lel'], ['on'], ['kaks'], ['suu', 're', 'mat'], ['mur', 'de', 'rüh', 'ma'], ['('], ['põh', 'ja', 'ees', 'ti'], ['ja'], ['lõu', 
'na', 'ees', 'ti'], [')'], [','], ['mõ', 'nes'], ['kä', 'sit', 'luses'], ['eris', 'ta', 'tak', 'se'], ['ka'], ['kir', 'de', 'ran', 'ni', 'ku'], ['mur', 'de', 'id'], ['eral', 'di'], ['mur', 'de', 'rüh', 'ma', 'na'], ['.']] - elif lang == 'fra': - assert syls == [['Le'], ['fran', 'çais'], ['est'], ['une'], ['langue'], ['in', 'do', 'eu', 'ro', 'péenne'], ['de'], ['la'], ['fa', 'mille'], ['des'], ['langues'], ['ro', 'manes'], ['dont'], ['les'], ['lo', 'cu', 'teurs'], ['sont'], ['ap', 'pe', 'lés'], ['fran', 'co', 'phones'], ['.']] - elif lang == 'glg': - assert syls == [['O'], ['ga', 'le', 'go'], ['('], ['['], ['ɡaˈleɣo̝'], [']'], ['['], ['1'], [']'], [')'], ['é'], ['unha'], ['lin', 'gua'], ['in', 'do', 'eu', 'ro', 'pea'], ['que'], ['per', 'ten', 'ce'], ['á'], ['póla'], ['de'], ['lin', 'guas'], ['ro', 'má', 'ni', 'cas'], ['.']] - elif lang.startswith('deu_'): - assert syls == [['Das'], ['Deut', 'sche'], ['ist'], ['ei', 'ne'], ['plu', 'ri', 'zen', 'tri', 'sche'], ['Spra', 'che'], [','], ['ent', 'hält'], ['al', 'so'], ['meh', 're', 're'], ['Stan', 'dard', 'va', 'ri', 'e', 'tä', 'ten'], ['in'], ['ver', 'schie', 'de', 'nen'], ['Re', 'gi', 'o', 'nen'], ['.']] - elif lang == 'ell': - assert syls == [['Η'], ['ελ', 'λη', 'νι', 'κή'], ['γλώσ', 'σα'], ['ανή', 'κει'], ['στην'], ['ιν', 'δο', 'ευ', 'ρω', 'παϊκή'], ['οι', 'κο', 'γένεια'], ['['], ['9'], [']'], ['και'], ['απο', 'τε', 'λεί'], ['το'], ['μο', 'να', 'δι', 'κό'], ['μέλος'], ['του'], ['ελ', 'λη', 'νι', 'κού'], ['κλάδου'], [','], ['ενώ'], ['εί', 'ναι'], ['η'], ['επί', 'ση', 'μη'], ['γλώσ', 'σα'], ['της'], ['Ελ', 'λάδας'], ['και'], ['της'], ['Κύ', 'πρου'], ['.']] - elif lang == 'hun': - assert syls == [['A'], ['ma', 'gyar'], ['nyelv'], ['az'], ['urá', 'li'], ['nyelv', 'csa', 'lád'], ['tag', 'ja'], [','], ['a'], ['finn', 'ugor'], ['nyel', 'vek'], ['kö', 'zé'], ['tar', 'to', 'zó'], ['ugor'], ['nyel', 'vek'], ['egyi', 'ke'], ['.']] - elif lang == 'isl': - assert syls == [['Ís', 'lenska'], ['er'], ['vest', 'ur', 'nor', 'rænt'], [','], ['germ', 'anskt'], ['og'], ['indó', 'evr', 'ópskt'], ['tungu', 'mál'], ['sem'], ['er'], ['eink', 'um'], ['tal', 'að'], ['og'], ['rit', 'að'], ['á'], ['Ís', 'landi'], ['og'], ['er'], ['móð', 'ur', 'mál'], ['lang', 'flestra'], ['Ís', 'lend', 'inga.'], ['['], ['5'], [']']] - elif lang == 'ind': - assert syls == [['Ba', 'ha', 'sa'], ['In', 'do', 'ne', 'sia'], ['ada', 'lah'], ['ba', 'ha', 'sa'], ['na', 'si', 'o', 'nal'], ['dan'], ['res', 'mi'], ['di'], ['se', 'lu', 'r', 'uh'], ['wi', 'la', 'yah'], ['In', 'do', 'ne', 'sia'], ['.']] - elif lang == 'ita': - assert syls == [["L'i", 'ta', 'lia', 'no'], ['('], ['['], ['itaˈ', 'l', 'jaː', 'no'], [']'], ['['], ['No', 'ta'], ['1'], [']'], ['ascol', 'taⓘ'], [')'], ['è'], ['una'], ['lin', 'gua'], ['ro', 'man', 'za'], ['par', 'la', 'ta'], ['prin', 'ci', 'pal', 'men', 'te'], ['in'], ['Ita', 'lia'], ['.']] - elif lang == 'lit': - assert syls == [['Lie', 'tu', 'vių'], ['kal', 'ba'], ['–'], ['iš'], ['bal', 'tų'], ['pro', 'kal', 'bės'], ['ki', 'lu', 'si'], ['lie', 'tu', 'vių'], ['tau', 'tos'], ['kal', 'ba'], [','], ['ku', 'ri'], ['Lie', 'tu', 'vo', 'je'], ['yra'], ['vals', 'ty', 'bi', 'nė'], [','], ['o'], ['Eu', 'ro', 'pos'], ['Są', 'jun', 'go', 'je'], ['–'], ['vie', 'na'], ['iš'], ['ofi', 'cia', 'lių', 'jų'], ['kal', 'bų'], ['.']] - elif lang == 'lav': - assert syls == [['Lat', 'vie', 'šu'], ['va', 'lo', 'da'], ['ir'], ['dzim', 'tā'], ['va', 'lo', 'da'], ['ap', 'mē', 'ram'], ['1,5'], ['mil', 'jo', 'niem'], ['cil', 'vē', 'ku'], [','], ['gal', 've', 'no', 'kārt'], ['Lat', 'vi', 'jā'], [','], 
['kur'], ['tā'], ['ir'], ['vien', 'ī', 'gā'], ['valsts'], ['va', 'lo', 'da.'], ['['], ['1'], [']'], ['['], ['3'], [']']] - elif lang == 'mon': - assert syls == [['Мон', 'гол'], ['хэл'], ['нь'], ['Мон', 'гол'], ['ул', 'сын'], ['ал', 'бан'], ['ёс', 'ны'], ['хэл'], ['юм'], ['.']] - elif lang == 'nob': - assert syls == [['Bok', 'mål'], ['er'], ['en'], ['av'], ['to'], ['of', 'fi', 'si', 'el', 'le'], ['mål', 'for', 'mer'], ['av'], ['norsk'], ['skrift', 'språk'], [','], ['hvor', 'av'], ['den'], ['and', 're'], ['er'], ['ny', 'norsk'], ['.']] - elif lang == 'nno': - assert syls == [['Ny', 'norsk'], [','], ['før'], ['1929'], ['of', 'fi', 'si', 'elt'], ['kal', 'la'], ['lands', 'mål'], [','], ['er'], ['si', 'dan'], ['jam', 'stil', 'lings', 'ved', 'ta', 'ket'], ['av'], ['12'], ['.'], ['mai'], ['1885'], ['ei'], ['av'], ['dei'], ['to'], ['of', 'fi', 'si', 'el', 'le'], ['mål', 'for', 'me', 'ne'], ['av'], ['norsk'], [';'], ['den'], ['and', 're'], ['for', 'ma'], ['er'], ['bok', 'mål'], ['.']] - elif lang == 'pol': - assert syls == [['Ję', 'zyk'], ['pol', 'ski'], [','], ['pol', 'sz', 'czy', 'zna'], ['–'], ['ję', 'zyk'], ['z'], ['gru', 'py'], ['za', 'chod', 'nio', 'sło', 'wiań', 'skiej'], ['('], ['do'], ['któ', 'rej'], ['na', 'le', 'żą'], ['rów', 'nież'], ['cze', 'ski'], [','], ['ka', 'szub', 'ski'], [','], ['sło', 'wac', 'ki'], ['i'], ['ję', 'zy', 'ki'], ['łu', 'życ', 'kie'], [')'], [','], ['sta', 'no', 'wią', 'cej'], ['część'], ['ro', 'dzi', 'ny'], ['in', 'do', 'eu', 'ro', 'pej', 'skiej'], ['.']] - elif lang.startswith('por_'): - assert syls == [['A'], ['lín', 'gua'], ['por', 'tu', 'gue', 'sa'], [','], ['tam', 'bém'], ['de', 'sig', 'na', 'da'], ['por', 'tu', 'guês'], [','], ['é'], ['uma'], ['lín', 'gua'], ['in', 'do', 'eu', 'ro', 'peia'], ['ro', 'mâ', 'ni', 'ca'], ['fle', 'xi', 'va'], ['oci', 'den', 'tal'], ['ori', 'gi', 'na', 'da'], ['no'], ['ga', 'le', 'go', 'por', 'tu', 'guês'], ['fa', 'la', 'do'], ['no'], ['Rei', 'no'], ['da'], ['Ga', 'li', 'za'], ['e'], ['no'], ['nor', 'te'], ['de'], ['Por', 'tu', 'gal'], ['.']] - elif lang == 'ron': - assert syls == [['Lim', 'ba'], ['ro', 'mâ', 'nă'], ['es', 'te'], ['o'], ['lim', 'bă'], ['in', 'do', 'e', 'u', 'ro', 'pe', 'a', 'nă'], ['din'], ['gru', 'pul'], ['ita', 'lic'], ['și'], ['din'], ['sub', 'gru', 'pul'], ['orien', 'tal'], ['al'], ['lim', 'bi', 'lor'], ['ro', 'ma', 'ni', 'ce'], ['.']] - elif lang == 'rus': - assert syls == [['Ру́с', 'ский'], ['язы́к'], ['('], ['МФА'], [':'], ['['], ['ˈruskʲɪi̯'], ['jɪˈzɨk'], [']'], ['ⓘ'], [')'], ['['], ['~'], ['3'], [']'], ['['], ['⇨'], [']'], ['—'], ['язык'], ['вос', 'точ', 'но', 'сла', 'вян', 'ской'], ['груп', 'пы'], ['сла', 'вян', 'ской'], ['вет', 'ви'], ['ин', 'до', 'ев', 'ро', 'пей', 'ской'], ['язы', 'ко', 'вой'], ['се', 'мьи'], [','], ['на', 'ци', 'о', 'наль', 'ный'], ['язык'], ['рус', 'ско', 'го'], ['на', 'ро', 'да'], ['.']] - elif lang == 'srp_cyrl': - assert syls == [['Срп', 'ски'], ['је', 'зик'], ['је'], ['зва', 'ни', 'чан'], ['у'], ['Ср', 'би', 'ји'], [','], ['Бо', 'сни'], ['и'], ['Хер', 'це', 'го', 'ви', 'ни'], ['и'], ['Цр', 'ној'], ['Го', 'ри'], ['и'], ['го', 'во', 'ри'], ['га'], ['око'], ['12'], ['ми', 'ли', 'о', 'на'], ['љу', 'ди.'], ['['], ['13'], [']']] - elif lang == 'srp_latn': - assert syls == [['Srp', 'ski'], ['je', 'zik'], ['je'], ['zva', 'ni', 'čan'], ['u'], ['Sr', 'bi', 'ji'], [','], ['Bo', 'sni'], ['i'], ['Her', 'ce', 'go', 'vi', 'ni'], ['i'], ['Cr', 'noj'], ['Go', 'ri'], ['i'], ['go', 'vo', 'ri'], ['ga'], ['oko'], ['12'], ['mi', 'li', 'o', 'na'], ['lju', 'di.'], ['['], ['13'], [']']] - elif lang == 
'slk': - assert syls == [['Slo', 'ven', 'či', 'na'], ['je'], ['ofi', 'ciál', 'ne'], ['úrad', 'ným'], ['ja', 'zy', 'kom'], ['Slo', 'ven', 'ska'], [','], ['Voj', 'vo', 'di', 'ny'], ['a'], ['od'], ['1'], ['.'], ['má', 'ja'], ['2004'], ['jed', 'ným'], ['z'], ['ja', 'zy', 'kov'], ['Európ', 'skej'], ['únie'], ['.']] - elif lang == 'slv': - assert syls == [['Slo', 'ven', 'šči', 'na'], ['['], ['slo', 'ˈʋe', 'nʃtʃi', 'na'], [']'], ['je'], ['zdru', 'že', 'ni'], ['na', 'ziv'], ['za'], ['ura', 'dni'], ['knji', 'žni'], ['je', 'zik'], ['Slo', 'ven', 'cev'], ['in'], ['sku', 'pno'], ['ime'], ['za'], ['na', 're', 'čja'], ['in'], ['go', 'vo', 're'], [','], ['ki'], ['jih'], ['go', 'vo', 'ri', 'jo'], ['ali'], ['so'], ['jih'], ['ne', 'koč'], ['go', 'vo', 'ri', 'li'], ['Slo', 'ven', 'ci'], ['.']] - elif lang == 'spa': - assert syls == [['El'], ['es', 'pa', 'ñol'], ['o'], ['cas', 'te', 'llano'], ['es'], ['una'], ['len', 'gua'], ['ro', 'man', 'ce'], ['pro', 'ce', 'den', 'te'], ['del'], ['la', 'tín'], ['ha', 'bla', 'do'], [','], ['per', 'te', 'ne', 'cien', 'te'], ['a'], ['la'], ['fa', 'mi', 'lia'], ['de'], ['len', 'guas'], ['in', 'doeu', 'ro', 'peas'], ['.']] - elif lang == 'swe': - assert syls == [['Svens', 'ka'], ['('], ['svens', 'ka'], ['('], ['in', 'fo'], [')'], [')'], ['är'], ['ett'], ['öst', 'nor', 'diskt'], ['språk'], ['som'], ['ta', 'las'], ['av'], ['un', 'ge', 'fär'], ['tio'], ['mil', 'jo', 'ner'], ['per', 'so', 'ner'], ['främst'], ['i'], ['Sve', 'ri', 'ge'], ['där'], ['språ', 'ket'], ['har'], ['en'], ['do', 'mi', 'nant'], ['ställ', 'ning'], ['som'], ['hu', 'vud', 'språk'], [','], ['men'], ['även'], ['som'], ['det'], ['ena'], ['na', 'tio', 'nal', 'språ', 'ket'], ['i'], ['Fin', 'land'], ['och'], ['som'], ['en', 'da'], ['of', 'fi', 'ci', 'el', 'la'], ['språk'], ['på'], ['Åland'], ['.']] - elif lang == 'tel': - assert syls == [['తె', 'లు', 'గు'], ['అనే', 'ది'], ['ద్రా', 'విడ'], ['భా', 'షల'], ['కు', 'టుం', 'బా', 'ని', 'కి'], ['చెం', 'దిన'], ['భాష'], ['.']] - elif lang == 'tha': - assert syls == [['ภา', 'ษา', 'ไทย'], ['หรือ'], ['ภา', 'ษา', 'ไทย'], ['กลาง'], ['เป็น'], ['ภา', 'ษา'], ['ใน'], ['กลุ่ม'], ['ภา', 'ษา'], ['ไท'], ['ซึ่ง'], ['เป็น'], ['กลุ่ม', 'ย่อย'], ['ของ'], ['ตระ', 'กูล'], ['ภา', 'ษา'], ['ข'], ['ร้า'], ['-'], ['ไท'], ['และ'], ['เป็น'], ['ภา', 'ษา', 'ราช', 'การ'], ['และ'], ['ภา', 'ษา', 'ประ', 'จำ', 'ชาติ'], ['ของ'], ['ประ', 'เทศ'], ['ไทย'], ['['], ['3'], [']['], ['4'], [']']] - elif lang == 'ukr': - assert syls == [['Укра', 'ї', '́', 'н', 'сь', 'ка'], ['мо', '́', 'ва'], ['('], ['МФА'], [':'], ['['], ['ukrɑ̽ˈjɪnʲsʲkɑ̽'], ['ˈmɔwɑ̽'], [']'], [','], ['іс', 'то', 'ри', 'ч', 'ні'], ['на', 'зви'], ['—'], ['ру', '́', 'сь', 'ка'], ['['], ['10'], [']'], ['['], ['11'], [']'], ['['], ['12'], [']'], ['['], ['*'], ['1'], [']'], [')'], ['—'], ['на', 'ціо', 'на', 'ль', 'на'], ['мо', 'ва'], ['укра', 'ї', 'н', 'ців'], ['.']] - elif lang == 'zul': - assert syls == [['Zu', 'lu'], ['/ˈzu', 'ːlu', 'ː/'], [','], ['no', 'ma'], ['isi', 'Zu', 'lu'], ['wu', 'li', 'mi'], ['lwa', 'ba', 'ntu'], ['ba', 'se'], ['Ni', 'ngi', 'zi', 'mu'], ['neA', 'fri', 'ka'], ['aba', 'yi', 'ngxe', 'nye'], ['ya', 'ma', 'Ngu', 'ni'], ['.']] - else: - raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) - - if tests_lang_util_skipped: - raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(syl_tokenizer) - -@pytest.mark.parametrize('lang, syl_tokenizer', test_syl_tokenizers) -def test_syl_tokenize_tokens_no_punc(lang, syl_tokenizer): - tokens = wl_word_tokenization.wl_word_tokenize_flat( - main, - text = getattr(wl_test_lang_examples, 
f'SENTENCE_{lang.upper()}'), - lang = lang - ) - syls_tokens = wl_syl_tokenization.wl_syl_tokenize_tokens_no_punc( + # Long + tokens_long = wl_syl_tokenization.wl_syl_tokenize( main, - tokens = tokens, + inputs = wl_texts.to_tokens(wl_test_lang_examples.TOKENS_LONG, lang = lang), lang = lang, syl_tokenizer = syl_tokenizer ) + syls_tokens_long = [token.syls for token in tokens_long] - # Tagged texts - syls_tokens_tagged = wl_syl_tokenization.wl_syl_tokenize_tokens_no_punc( - main, - tokens = [token + '_TEST' for token in tokens], - lang = lang, - syl_tokenizer = syl_tokenizer, - tagged = True - ) + assert syls_tokens_long == [(token,) for token in wl_test_lang_examples.TOKENS_LONG] - # Long texts - syls_tokens_long = wl_syl_tokenization.wl_syl_tokenize( + # Syllabified + syls_tokens_orig = [('te', 'st')] + tokens_syllabified = wl_syl_tokenization.wl_syl_tokenize( main, - inputs = [str(i) for i in range(101) for j in range(50)], + inputs = wl_texts.to_tokens(['test'], lang = lang, syls_tokens = syls_tokens_orig), lang = lang, syl_tokenizer = syl_tokenizer ) + syls_tokens_syllabified = [token.syls for token in tokens_syllabified] + + assert syls_tokens_syllabified == syls_tokens_orig + + match lang: + case 'afr': + assert syls_tokens == [('Afri', 'kaans'), ('is',), ('ti', 'po', 'lo', 'gies'), ('be', 'skou'), ("'n",), ('In', 'do', 'Eu', 'ro', 'pe', 'se'), (',',), ('Wes', 'Ger', 'maan', 'se'), (',',), ('Ne', 'derfran', 'kie', 'se'), ('taal',), (',',), ('[',), ('2',), (']',), ('wat',), ('aan',), ('die',), ('suid', 'punt'), ('van',), ('Afri', 'ka'), ('on', 'der'), ('in', 'vloed'), ('van',), ('ver', 'skeie'), ('an', 'der'), ('ta', 'le'), ('en',), ('taal', 'groe', 'pe'), ('ont', 'staan'), ('het',), ('.',)] + case 'sqi': + assert syls_tokens == [('Gju', 'ha'), ('shqi', 'pe'), ('(',), ('ose',), ('thjesht',), ('shqi', 'p', 'ja'), (')',), ('ësh', 'të'), ('gju', 'hë'), ('dhe',), ('de', 'gë'), ('e',), ('ve', 'ça', 'n', 'të'), ('e',), ('fa', 'mi', 'l', 'jes'), ('in', 'do', 'e', 'v', 'ro', 'pi', 'ane'), ('që',), ('fli', 'tet'), ('nga',), ('rreth',), ('7', '10'), ('mi', 'li', 'onë'), ('nje', 'rëz'), ('në',), ('bo', 'të'), (',',), ('[',), ('1',), (']',), ('kry', 'esisht'), ('në',), ('Shqi', 'pë', 'ri'), (',',), ('Ko', 'so', 'vë'), ('dhe',), ('Ma', 'qe', 'do', 'ni', 'në'), ('e',), ('Ve', 'ri', 'ut'), (',',), ('por',), ('edhe',), ('në',), ('zo', 'na'), ('të',), ('tje', 'ra'), ('të',), ('Ev', 'ro', 'pës'), ('Ju', 'g', 'li', 'n', 'do', 're'), ('ku',), ('ka',), ('një',), ('po', 'pu', 'll', 'si'), ('shqi', 'p', 'ta', 're'), (',',), ('du', 'ke'), ('pë', 'r', 'f', 'shi', 'rë'), ('Ma', 'lin'), ('e',), ('Zi',), ('dhe',), ('Lu', 'gi', 'nën'), ('e',), ('Pre', 'she', 'vës'), ('.',)] + case 'bel': + assert syls_tokens == [('Бе', 'ла', 'ру́с', 'кая'), ('мо́', 'ва'), ('—',), ('на', 'цы', 'я', 'на', 'ль', 'ная'), ('мо', 'ва'), ('бе', 'ла', 'ру', 'саў'), (',',), ('ува', 'хо', 'дзіць'), ('у',), ('ін', 'да', 'еў', 'ра', 'пей', 'с', 'кую'), ('моў', 'ную'), ('сям',), ("'",), ('ю',), (',',), ('сла', 'вя', 'н', 'с', 'кую'), ('гру', 'пу'), (',',), ('ус', 'хо', 'д', 'не', 'с', 'ла', 'вя', 'н', 'с', 'кую'), ('па', 'д', 'г', 'ру', 'пу'), ('.',)] + case 'bul': + assert syls_tokens == [('Бъ', '̀л', 'гар', 'с', 'ки', 'ят'), ('ез', 'ѝк'), ('е',), ('ин', 'до', 'ев', 'ро', 'пейс', 'ки'), ('език',), ('от',), ('гру', 'па', 'та'), ('на',), ('юж', 'нос', 'ла', 'вян', 'с', 'ки', 'те'), ('ези', 'ци'), (',',), ('ка', 'то'), ('об', 'ра', 'зу', 'ва'), ('не', 'го', 'ва', 'та'), ('из', 'точ', 'на'), ('под', 'г', 'ру', 'па'), ('.',)] + case 'cat': + 
assert syls_tokens == [('El',), ('ca', 'ta', 'là'), ('(',), ('de', 'no', 'mi', 'na', 'ció'), ('ofi', 'ci', 'al'), ('a',), ('Ca', 'ta', 'lu', 'nya'), (',',), ('a',), ('les',), ('Illes',), ('Ba', 'le', 'ars'), (',',), ('a',), ('An', 'dor', 'ra'), (',',), ('a',), ('la',), ('ciu', 'tat'), ('de',), ("l'", 'Al', 'guer'), ('i',), ('tra', 'di', 'ci', 'o', 'nal'), ('a',), ('Ca', 'ta', 'lu', 'nya'), ('del',), ('Nord',), (')',), ('o',), ('va', 'len', 'cià'), ('(',), ('de', 'no', 'mi', 'na', 'ció'), ('ofi', 'ci', 'al'), ('al',), ('Pa', 'ís'), ('Va', 'len', 'cià'), ('i',), ('tra', 'di', 'ci', 'o', 'nal'), ('al',), ('Car', 'xe'), (')',), ('és',), ('una',), ('llen', 'gua'), ('ro', 'mà', 'ni', 'ca'), ('par', 'la', 'da'), ('a',), ('Ca', 'ta', 'lu', 'nya'), (',',), ('el',), ('Pa', 'ís'), ('Va', 'len', 'cià'), ('(',), ('tret',), ("d'", 'al', 'gu', 'nes'), ('co', 'mar', 'ques'), ('i',), ('lo', 'ca', 'li', 'tats'), ('de',), ("l'", 'in', 'te', 'ri', 'or'), (')',), (',',), ('les',), ('Illes',), ('Ba', 'le', 'ars'), ('(',), ('on',), ('tam', 'bé'), ('rep',), ('el',), ('nom',), ('de',), ('ma', 'llor', 'quí'), (',',), ('me', 'nor', 'quí'), (',',), ('ei', 'vis', 'senc'), ('o',), ('for', 'men', 'te', 'rer'), ('se', 'gons'), ("l'", 'i', 'lla'), (')',), (',',), ('An', 'dor', 'ra'), (',',), ('la',), ('Fran', 'ja'), ('de',), ('Po', 'nent'), ('(',), ('a',), ("l'", 'A', 'ra', 'gó'), (')',), (',',), ('la',), ('ciu', 'tat'), ('de',), ("l'", 'Al', 'guer'), ('(',), ('a',), ("l'", 'i', 'lla'), ('de',), ('Sar', 'de', 'nya'), (')',), (',',), ('la',), ('Ca', 'ta', 'lu', 'nya'), ('del',), ('Nord',), (',',), ('[',), ('8',), (']',), ('el',), ('Car', 'xe'), ('(',), ('un',), ('pe', 'tit'), ('ter', 'ri', 'to', 'ri'), ('de',), ('Múr', 'cia'), ('ha', 'bi', 'tat'), ('per',), ('po', 'bla', 'dors'), ('va', 'len', 'ci', 'ans'), (')',), (',',), ('[',), ('9',), (']',), ('[',), ('10',), (']',), ('i',), ('en',), ('co', 'mu', 'ni', 'tats'), ('ar', 'reu'), ('del',), ('món',), ('(',), ('en', 'tre'), ('les',), ('quals',), ('des', 'ta', 'ca'), ('la',), ('de',), ("l'", 'Ar', 'gen', 'ti', 'na'), (',',), ('amb',), ('200.000',), ('par', 'lants'), (')',), ('.',), ('[',), ('11',), (']',)] + case 'hrv': + assert syls_tokens == [('Hr', 'vat', 'ski'), ('je', 'zik'), ('(',), ('ISO',), ('639', '3'), (':',), ('hrv',), (')',), ('skup', 'ni'), ('je',), ('na', 'ziv'), ('za',), ('na', 'ci', 'onal', 'ni'), ('stan', 'dard', 'ni'), ('je', 'zik'), ('Hr', 'va', 'ta'), (',',), ('te',), ('za',), ('skup',), ('na', 'rje', 'čja'), ('i',), ('go', 'vo', 'ra'), ('ko', 'ji', 'ma'), ('go', 'vo', 're'), ('ili',), ('su',), ('ne', 'ka', 'da'), ('go', 'vo', 'ri', 'li'), ('Hr', 'va', 'ti'), ('.',)] + case 'ces': + assert syls_tokens == [('Češ', 'ti', 'na'), ('ne', 'bo', 'li'), ('čes', 'ký'), ('ja', 'zyk'), ('je',), ('zá', 'pa', 'doslo', 'van', 'ský'), ('ja', 'zyk'), (',',), ('nej', 'bliž', 'ší'), ('slo', 'ven', 'šti', 'ně'), (',',), ('po', 'té'), ('lužic', 'ké'), ('srbšti', 'ně'), ('a',), ('pol', 'šti', 'ně'), ('.',)] + case 'dan': + assert syls_tokens == [('Dansk',), ('er',), ('et',), ('øst', 'n', 'or', 'disk'), ('sprog',), ('in', 'den', 'for'), ('den',), ('ger', 'man', 'ske'), ('gren',), ('af',), ('den',), ('in', 'do', 'eu', 'ro', 'pæ', 'i', 'ske'), ('sprog', 'fa', 'mi', 'lie'), ('.',)] + case 'nld': + assert syls_tokens == [('Het',), ('Ne', 'der', 'lands'), ('is',), ('een',), ('Wes', 't', 'Ger', 'maan', 'se'), ('taal',), (',',), ('de',), ('meest',), ('ge', 'bruik', 'te'), ('taal',), ('in',), ('Ne', 'der', 'land'), ('en',), ('Bel', 'gië'), (',',), ('de',), ('of', 'fi', 'ci', 'ë', 
'le'), ('taal',), ('van',), ('Su', 'ri', 'na', 'me'), ('en',), ('een',), ('van',), ('de',), ('drie',), ('of', 'fi', 'ci', 'ë', 'le'), ('ta', 'len'), ('van',), ('Bel', 'gië'), ('.',)] + case 'eng_gb' | 'eng_us': + match syl_tokenizer: + case 'nltk_legality': + assert syls_tokens == [('En', 'glish'), ('is',), ('a',), ('West',), ('Ger', 'ma', 'nic'), ('lan', 'gu', 'a', 'ge'), ('in',), ('the',), ('In', 'do-', 'E', 'u', 'rop', 'ean'), ('lan', 'gu', 'a', 'ge'), ('fa', 'mi', 'ly'), ('.',)] + case 'nltk_sonority_sequencing': + assert syls_tokens == [('English',), ('is',), ('a',), ('West',), ('Ger', 'ma', 'nic'), ('lan', 'gua', 'ge'), ('in',), ('the',), ('Indo', '-', 'Eu', 'ro', 'pean'), ('lan', 'gua', 'ge'), ('fa', 'mi', 'ly'), ('.',)] + case 'pyphen_eng_gb': + assert syls_tokens == [('Eng', 'lish'), ('is',), ('a',), ('West',), ('Ger', 'man', 'ic'), ('lan', 'guage'), ('in',), ('the',), ('In', 'do', 'European'), ('lan', 'guage'), ('fam', 'ily'), ('.',)] + case 'pyphen_eng_us': + assert syls_tokens == [('Eng', 'lish'), ('is',), ('a',), ('West',), ('Ger', 'man', 'ic'), ('lan', 'guage'), ('in',), ('the',), ('In', 'do', 'Eu', 'ro', 'pean'), ('lan', 'guage'), ('fam', 'i', 'ly'), ('.',)] + case _: + tests_lang_util_skipped = True + case 'epo': + assert syls_tokens == [('Es', 'pe', 'r', 'anto'), (',',), ('ori', 'gi', 'ne'), ('la',), ('Lin', 'g', 'vo'), ('In', 'ter', 'na', 'cia'), (',',), ('[',), ('4',), (']',), ('es', 'tas'), ('la',), ('plej',), ('dis', 'vas', 't', 'iĝ', 'inta'), ('in', 'ter', 'na', 'cia'), ('plan', 'lin', 'g', 'vo.'), ('[',), ('5',), (']',)] + case 'est': + assert syls_tokens == [('Ees', 'ti'), ('kee', 'lel'), ('on',), ('kaks',), ('suu', 're', 'mat'), ('mur', 'de', 'rüh', 'ma'), ('(',), ('põh', 'ja', 'ees', 'ti'), ('ja',), ('lõu', 'na', 'ees', 'ti'), (')',), (',',), ('mõ', 'nes'), ('kä', 'sit', 'luses'), ('eris', 'ta', 'tak', 'se'), ('ka',), ('kir', 'de', 'ran', 'ni', 'ku'), ('mur', 'de', 'id'), ('eral', 'di'), ('mur', 'de', 'rüh', 'ma', 'na'), ('.',)] + case 'fra': + assert syls_tokens == [('Le',), ('fran', 'çais'), ('est',), ('une',), ('langue',), ('in', 'do', 'eu', 'ro', 'péenne'), ('de',), ('la',), ('fa', 'mille'), ('des',), ('langues',), ('ro', 'manes'), ('dont',), ('les',), ('lo', 'cu', 'teurs'), ('sont',), ('ap', 'pe', 'lés'), ('fran', 'co', 'phones'), ('.',)] + case 'glg': + assert syls_tokens == [('O',), ('ga', 'le', 'go'), ('(',), ('[',), ('ɡaˈleɣo̝',), (']',), ('[',), ('1',), (']',), (')',), ('é',), ('unha',), ('lin', 'gua'), ('in', 'do', 'eu', 'ro', 'pea'), ('que',), ('per', 'ten', 'ce'), ('á',), ('póla',), ('de',), ('lin', 'guas'), ('ro', 'má', 'ni', 'cas'), ('.',)] + case 'deu_at' | 'deu_de' | 'deu_ch': + assert syls_tokens == [('Das',), ('Deut', 'sche'), ('ist',), ('ei', 'ne'), ('plu', 'ri', 'zen', 'tri', 'sche'), ('Spra', 'che'), (',',), ('ent', 'hält'), ('al', 'so'), ('meh', 're', 're'), ('Stan', 'dard', 'va', 'ri', 'e', 'tä', 'ten'), ('in',), ('ver', 'schie', 'de', 'nen'), ('Re', 'gi', 'o', 'nen'), ('.',)] + case 'ell': + assert syls_tokens == [('Η',), ('ελ', 'λη', 'νι', 'κή'), ('γλώσ', 'σα'), ('ανή', 'κει'), ('στην',), ('ιν', 'δο', 'ευ', 'ρω', 'παϊκή'), ('οι', 'κο', 'γένεια'), ('[',), ('9',), (']',), ('και',), ('απο', 'τε', 'λεί'), ('το',), ('μο', 'να', 'δι', 'κό'), ('μέλος',), ('του',), ('ελ', 'λη', 'νι', 'κού'), ('κλάδου',), (',',), ('ενώ',), ('εί', 'ναι'), ('η',), ('επί', 'ση', 'μη'), ('γλώσ', 'σα'), ('της',), ('Ελ', 'λάδας'), ('και',), ('της',), ('Κύ', 'πρου'), ('.',)] + case 'hun': + assert syls_tokens == [('A',), ('ma', 'gyar'), ('nyelv',), ('az',), ('urá', 
'li'), ('nyelv', 'csa', 'lád'), ('tag', 'ja'), (',',), ('a',), ('finn', 'ugor'), ('nyel', 'vek'), ('kö', 'zé'), ('tar', 'to', 'zó'), ('ugor',), ('nyel', 'vek'), ('egyi', 'ke'), ('.',)] + case 'isl': + assert syls_tokens == [('Ís', 'lenska'), ('er',), ('vest', 'ur', 'nor', 'rænt'), (',',), ('germ', 'anskt'), ('og',), ('indó', 'evr', 'ópskt'), ('tungu', 'mál'), ('sem',), ('er',), ('eink', 'um'), ('tal', 'að'), ('og',), ('rit', 'að'), ('á',), ('Ís', 'landi'), ('og',), ('er',), ('móð', 'ur', 'mál'), ('lang', 'flestra'), ('Ís', 'lend', 'inga.'), ('[',), ('5',), (']',)] + case 'ind': + assert syls_tokens == [('Ba', 'ha', 'sa'), ('In', 'do', 'ne', 'sia'), ('ada', 'lah'), ('ba', 'ha', 'sa'), ('na', 'si', 'o', 'nal'), ('dan',), ('res', 'mi'), ('di',), ('se', 'lu', 'r', 'uh'), ('wi', 'la', 'yah'), ('In', 'do', 'ne', 'sia'), ('.',)] + case 'ita': + assert syls_tokens == [("L'i", 'ta', 'lia', 'no'), ('(',), ('[',), ('itaˈ', 'l', 'jaː', 'no'), (']',), ('[',), ('No', 'ta'), ('1',), (']',), ('ascol', 'taⓘ'), (')',), ('è',), ('una',), ('lin', 'gua'), ('ro', 'man', 'za'), ('par', 'la', 'ta'), ('prin', 'ci', 'pal', 'men', 'te'), ('in',), ('Ita', 'lia'), ('.',)] + case 'lit': + assert syls_tokens == [('Lie', 'tu', 'vių'), ('kal', 'ba'), ('–',), ('iš',), ('bal', 'tų'), ('pro', 'kal', 'bės'), ('ki', 'lu', 'si'), ('lie', 'tu', 'vių'), ('tau', 'tos'), ('kal', 'ba'), (',',), ('ku', 'ri'), ('Lie', 'tu', 'vo', 'je'), ('yra',), ('vals', 'ty', 'bi', 'nė'), (',',), ('o',), ('Eu', 'ro', 'pos'), ('Są', 'jun', 'go', 'je'), ('–',), ('vie', 'na'), ('iš',), ('ofi', 'cia', 'lių', 'jų'), ('kal', 'bų'), ('.',)] + case 'lav': + assert syls_tokens == [('Lat', 'vie', 'šu'), ('va', 'lo', 'da'), ('ir',), ('dzim', 'tā'), ('va', 'lo', 'da'), ('ap', 'mē', 'ram'), ('1,5',), ('mil', 'jo', 'niem'), ('cil', 'vē', 'ku'), (',',), ('gal', 've', 'no', 'kārt'), ('Lat', 'vi', 'jā'), (',',), ('kur',), ('tā',), ('ir',), ('vien', 'ī', 'gā'), ('valsts',), ('va', 'lo', 'da.'), ('[',), ('1',), (']',), ('[',), ('3',), (']',)] + case 'mon': + assert syls_tokens == [('Мон', 'гол'), ('хэл',), ('нь',), ('Мон', 'гол'), ('ул', 'сын'), ('ал', 'бан'), ('ёс', 'ны'), ('хэл',), ('юм',), ('.',)] + case 'nob': + assert syls_tokens == [('Bok', 'mål'), ('er',), ('en',), ('av',), ('to',), ('of', 'fi', 'si', 'el', 'le'), ('mål', 'for', 'mer'), ('av',), ('norsk',), ('skrift', 'språk'), (',',), ('hvor', 'av'), ('den',), ('and', 're'), ('er',), ('ny', 'norsk'), ('.',)] + case 'nno': + assert syls_tokens == [('Ny', 'norsk'), (',',), ('før',), ('1929',), ('of', 'fi', 'si', 'elt'), ('kal', 'la'), ('lands', 'mål'), (',',), ('er',), ('si', 'dan'), ('jam', 'stil', 'lings', 'ved', 'ta', 'ket'), ('av',), ('12',), ('.',), ('mai',), ('1885',), ('ei',), ('av',), ('dei',), ('to',), ('of', 'fi', 'si', 'el', 'le'), ('mål', 'for', 'me', 'ne'), ('av',), ('norsk',), (';',), ('den',), ('and', 're'), ('for', 'ma'), ('er',), ('bok', 'mål'), ('.',)] + case 'pol': + assert syls_tokens == [('Ję', 'zyk'), ('pol', 'ski'), (',',), ('pol', 'sz', 'czy', 'zna'), ('–',), ('ję', 'zyk'), ('z',), ('gru', 'py'), ('za', 'chod', 'nio', 'sło', 'wiań', 'skiej'), ('(',), ('do',), ('któ', 'rej'), ('na', 'le', 'żą'), ('rów', 'nież'), ('cze', 'ski'), (',',), ('ka', 'szub', 'ski'), (',',), ('sło', 'wac', 'ki'), ('i',), ('ję', 'zy', 'ki'), ('łu', 'życ', 'kie'), (')',), (',',), ('sta', 'no', 'wią', 'cej'), ('część',), ('ro', 'dzi', 'ny'), ('in', 'do', 'eu', 'ro', 'pej', 'skiej'), ('.',)] + case 'por_br' | 'por_pt': + assert syls_tokens == [('A',), ('lín', 'gua'), ('por', 'tu', 'gue', 'sa'), (',',), ('tam', 'bém'), 
('de', 'sig', 'na', 'da'), ('por', 'tu', 'guês'), (',',), ('é',), ('uma',), ('lín', 'gua'), ('in', 'do', 'eu', 'ro', 'peia'), ('ro', 'mâ', 'ni', 'ca'), ('fle', 'xi', 'va'), ('oci', 'den', 'tal'), ('ori', 'gi', 'na', 'da'), ('no',), ('ga', 'le', 'go', 'por', 'tu', 'guês'), ('fa', 'la', 'do'), ('no',), ('Rei', 'no'), ('da',), ('Ga', 'li', 'za'), ('e',), ('no',), ('nor', 'te'), ('de',), ('Por', 'tu', 'gal'), ('.',)] + case 'ron': + assert syls_tokens == [('Lim', 'ba'), ('ro', 'mâ', 'nă'), ('es', 'te'), ('o',), ('lim', 'bă'), ('in', 'do', 'e', 'u', 'ro', 'pe', 'a', 'nă'), ('din',), ('gru', 'pul'), ('ita', 'lic'), ('și',), ('din',), ('sub', 'gru', 'pul'), ('orien', 'tal'), ('al',), ('lim', 'bi', 'lor'), ('ro', 'ma', 'ni', 'ce'), ('.',)] + case 'rus': + assert syls_tokens == [('Ру́с', 'ский'), ('язы́к',), ('(',), ('МФА',), (':',), ('[',), ('ˈruskʲɪi̯',), ('jɪˈzɨk',), (']',), ('ⓘ',), (')',), ('[',), ('~',), ('3',), (']',), ('[',), ('⇨',), (']',), ('—',), ('язык',), ('вос', 'точ', 'но', 'сла', 'вян', 'ской'), ('груп', 'пы'), ('сла', 'вян', 'ской'), ('вет', 'ви'), ('ин', 'до', 'ев', 'ро', 'пей', 'ской'), ('язы', 'ко', 'вой'), ('се', 'мьи'), (',',), ('на', 'ци', 'о', 'наль', 'ный'), ('язык',), ('рус', 'ско', 'го'), ('на', 'ро', 'да'), ('.',)] + case 'srp_cyrl': + assert syls_tokens == [('Срп', 'ски'), ('је', 'зик'), ('је',), ('зва', 'ни', 'чан'), ('у',), ('Ср', 'би', 'ји'), (',',), ('Бо', 'сни'), ('и',), ('Хер', 'це', 'го', 'ви', 'ни'), ('и',), ('Цр', 'ној'), ('Го', 'ри'), ('и',), ('го', 'во', 'ри'), ('га',), ('око',), ('12',), ('ми', 'ли', 'о', 'на'), ('љу', 'ди.'), ('[',), ('13',), (']',)] + case 'srp_latn': + assert syls_tokens == [('Srp', 'ski'), ('je', 'zik'), ('je',), ('zva', 'ni', 'čan'), ('u',), ('Sr', 'bi', 'ji'), (',',), ('Bo', 'sni'), ('i',), ('Her', 'ce', 'go', 'vi', 'ni'), ('i',), ('Cr', 'noj'), ('Go', 'ri'), ('i',), ('go', 'vo', 'ri'), ('ga',), ('oko',), ('12',), ('mi', 'li', 'o', 'na'), ('lju', 'di.'), ('[',), ('13',), (']',)] + case 'slk': + assert syls_tokens == [('Slo', 'ven', 'či', 'na'), ('je',), ('ofi', 'ciál', 'ne'), ('úrad', 'ným'), ('ja', 'zy', 'kom'), ('Slo', 'ven', 'ska'), (',',), ('Voj', 'vo', 'di', 'ny'), ('a',), ('od',), ('1',), ('.',), ('má', 'ja'), ('2004',), ('jed', 'ným'), ('z',), ('ja', 'zy', 'kov'), ('Európ', 'skej'), ('únie',), ('.',)] + case 'slv': + assert syls_tokens == [('Slo', 'ven', 'šči', 'na'), ('[',), ('slo', 'ˈʋe', 'nʃtʃi', 'na'), (']',), ('je',), ('zdru', 'že', 'ni'), ('na', 'ziv'), ('za',), ('ura', 'dni'), ('knji', 'žni'), ('je', 'zik'), ('Slo', 'ven', 'cev'), ('in',), ('sku', 'pno'), ('ime',), ('za',), ('na', 're', 'čja'), ('in',), ('go', 'vo', 're'), (',',), ('ki',), ('jih',), ('go', 'vo', 'ri', 'jo'), ('ali',), ('so',), ('jih',), ('ne', 'koč'), ('go', 'vo', 'ri', 'li'), ('Slo', 'ven', 'ci'), ('.',)] + case 'spa': + assert syls_tokens == [('El',), ('es', 'pa', 'ñol'), ('o',), ('cas', 'te', 'llano'), ('es',), ('una',), ('len', 'gua'), ('ro', 'man', 'ce'), ('pro', 'ce', 'den', 'te'), ('del',), ('la', 'tín'), ('ha', 'bla', 'do'), (',',), ('per', 'te', 'ne', 'cien', 'te'), ('a',), ('la',), ('fa', 'mi', 'lia'), ('de',), ('len', 'guas'), ('in', 'doeu', 'ro', 'peas'), ('.',)] + case 'swe': + assert syls_tokens == [('Svens', 'ka'), ('(',), ('svens', 'ka'), ('(',), ('in', 'fo'), (')',), (')',), ('är',), ('ett',), ('öst', 'nor', 'diskt'), ('språk',), ('som',), ('ta', 'las'), ('av',), ('un', 'ge', 'fär'), ('tio',), ('mil', 'jo', 'ner'), ('per', 'so', 'ner'), ('främst',), ('i',), ('Sve', 'ri', 'ge'), ('där',), ('språ', 'ket'), ('har',), ('en',), ('do', 'mi', 
'nant'), ('ställ', 'ning'), ('som',), ('hu', 'vud', 'språk'), (',',), ('men',), ('även',), ('som',), ('det',), ('ena',), ('na', 'tio', 'nal', 'språ', 'ket'), ('i',), ('Fin', 'land'), ('och',), ('som',), ('en', 'da'), ('of', 'fi', 'ci', 'el', 'la'), ('språk',), ('på',), ('Åland',), ('.',)] + case 'tel': + assert syls_tokens == [('తె', 'లు', 'గు'), ('అనే', 'ది'), ('ద్రా', 'విడ'), ('భా', 'షల'), ('కు', 'టుం', 'బా', 'ని', 'కి'), ('చెం', 'దిన'), ('భాష',), ('.',)] + case 'tha': + assert syls_tokens == [('ภา', 'ษา', 'ไทย'), ('หรือ',), ('ภา', 'ษา', 'ไทย'), ('กลาง',), ('เป็น',), ('ภา', 'ษา'), ('ใน',), ('กลุ่ม',), ('ภา', 'ษา'), ('ไท',), ('ซึ่ง',), ('เป็น',), ('กลุ่ม', 'ย่อย'), ('ของ',), ('ตระ', 'กูล'), ('ภา', 'ษา'), ('ข',), ('ร้า',), ('-',), ('ไท',), ('และ',), ('เป็น',), ('ภา', 'ษา', 'ราช', 'การ'), ('และ',), ('ภา', 'ษา', 'ประ', 'จำ', 'ชาติ'), ('ของ',), ('ประ', 'เทศ'), ('ไทย',), ('[',), ('3',), ('][',), ('4',), (']',)] + case 'ukr': + assert syls_tokens == [('Укра', 'ї', '́', 'н', 'сь', 'ка'), ('мо', '́', 'ва'), ('(',), ('МФА',), (':',), ('[',), ('ukrɑ̽ˈjɪnʲsʲkɑ̽',), ('ˈmɔwɑ̽',), (']',), (',',), ('іс', 'то', 'ри', 'ч', 'ні'), ('на', 'зви'), ('—',), ('ру', '́', 'сь', 'ка'), ('[',), ('10',), (']',), ('[',), ('11',), (']',), ('[',), ('12',), (']',), ('[',), ('*',), ('1',), (']',), (')',), ('—',), ('на', 'ціо', 'на', 'ль', 'на'), ('мо', 'ва'), ('укра', 'ї', 'н', 'ців'), ('.',)] + case 'zul': + assert syls_tokens == [('Zu', 'lu'), ('/ˈzu', 'ːlu', 'ː/'), (',',), ('no', 'ma'), ('isi', 'Zu', 'lu'), ('wu', 'li', 'mi'), ('lwa', 'ba', 'ntu'), ('ba', 'se'), ('Ni', 'ngi', 'zi', 'mu'), ('neA', 'fri', 'ka'), ('aba', 'yi', 'ngxe', 'nye'), ('ya', 'ma', 'Ngu', 'ni'), ('.',)] + case _: + raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) - # Check for empty syllables - assert all(all(syls_token) for syls_token in syls_tokens) - - # The count of syllable should be more than the count of tokens - assert sum((len(syls_token) for syls_token in syls_tokens)) > len(tokens) - - # Length of syllabified tokens should be equal to or less than the length of tokens - assert len(syls_tokens) <= len(tokens) - - # Check for punctuation marks - assert not any(( - bool(len(syls) == 1 and wl_checks_tokens.is_punc(syls[0])) - for syls in syls_tokens - )) - - # Tagged texts - for syls_tagged in syls_tokens_tagged: - syls_tagged[-1] = syls_tagged[-1].replace('_TEST', '') - - assert syls_tokens_tagged == syls_tokens - - # Long texts - assert syls_tokens_long == [[str(i)] for i in range(101) for j in range(50)] + if tests_lang_util_skipped: + raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(syl_tokenizer) if __name__ == '__main__': for lang, syl_tokenizer in test_syl_tokenizers: test_syl_tokenize(lang, syl_tokenizer) - - for lang, syl_tokenizer in test_syl_tokenizers: - test_syl_tokenize_tokens_no_punc(lang, syl_tokenizer) diff --git a/tests/tests_nlp/test_word_detokenization.py b/tests/tests_nlp/test_word_detokenization.py index 62d24c833..7d032b45f 100644 --- a/tests/tests_nlp/test_word_detokenization.py +++ b/tests/tests_nlp/test_word_detokenization.py @@ -38,21 +38,21 @@ @pytest.mark.parametrize('lang', test_langs) def test_word_detokenize(lang): - if lang.startswith('zho_'): - text = '英国全称是United Kingdom of Great Britain,由四个部分组成:England、Scotland、Wales和Northern Ireland' - elif lang == 'jpn': - text = '''The meaning of "天気がいいから、散歩しましょう。" is: The weather is good so let's take a walk.''' - elif lang == 'bod': - text = 'Test this Tibetan string: དུང་དང་འོ་མར་འགྲན་པའི་ལྷག་བསམ་མཐུ། །དམན་ཡང་དཀར་པོའི་བྱས་འབྲས་ཅུང་ཟད་ཅིག 
།བློ་དང་འདུན་པ་བཟང་བའི་རང་རིགས་ཀུན། །རྒྱལ་ཁའི་འཕྲིན་བཟང་ལས་དོན་འགྲུབ་ཕྱིར་འབད།།. Does detokenization work as expected?' - else: - text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') + match lang: + case 'zho_cn' | 'zho_tw': + text = '英国全称是United Kingdom of Great Britain,由四个部分组成:England、Scotland、Wales和Northern Ireland' + case 'jpn': + text = '''The meaning of "天気がいいから、散歩しましょう。" is: The weather is good so let's take a walk.''' + case 'bod': + text = 'Test this Tibetan string: དུང་དང་འོ་མར་འགྲན་པའི་ལྷག་བསམ་མཐུ། །དམན་ཡང་དཀར་པོའི་བྱས་འབྲས་ཅུང་ཟད་ཅིག །བློ་དང་འདུན་པ་བཟང་བའི་རང་རིགས་ཀུན། །རྒྱལ་ཁའི་འཕྲིན་བཟང་ལས་དོན་འགྲུབ་ཕྱིར་འབད།།. Does detokenization work as expected?' + case _: + text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') tokens = wl_word_tokenization.wl_word_tokenize_flat( main, text = text, lang = lang ) - text = wl_word_detokenization.wl_word_detokenize( main, tokens = tokens, @@ -62,18 +62,19 @@ def test_word_detokenize(lang): print(f'{lang}:') print(f'{text}\n') - if lang.startswith('zho_'): - assert text == '英国全称是United Kingdom of Great Britain,由四个部分组成:England、Scotland、Wales和Northern Ireland' - elif lang in ['eng_us', 'other']: - assert text == 'English is a West Germanic language in the Indo-European language family.' - elif lang == 'jpn': - assert text == '''The meaning of "天気がいいから、散歩しましょう。"is: The weather is good so let 's take a walk.''' - elif lang == 'tha': - assert text == 'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาในกลุ่มภาษาไทซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า - ไทและเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย [ 3 ][ 4 ]' - elif lang == 'bod': - assert text == 'Test this Tibetan string: དུང་དང་འོ་མར་འགྲན་པའི་ལྷག་བསམ་མཐུ། །དམན་ཡང་དཀར་པོའི་བྱས་འབྲས་ཅུང་ཟད་ཅིག །བློ་དང་འདུན་པ་བཟང་བའི་རང་རིགས་ཀུན། །རྒྱལ་ཁའི་འཕྲིན་བཟང་ལས་དོན་འགྲུབ་ཕྱིར་འབད།།' - else: - raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + match lang: + case 'zho_cn' | 'zho_tw': + assert text == '英国全称是United Kingdom of Great Britain,由四个部分组成:England、Scotland、Wales和Northern Ireland' + case 'eng_us' | 'other': + assert text == 'English is a West Germanic language in the Indo-European language family.' 
+ case 'jpn': + assert text == '''The meaning of "天気がいいから、散歩しましょう。"is: The weather is good so let 's take a walk.''' + case 'tha': + assert text == 'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาในกลุ่มภาษาไทซึ่งเป็นกลุ่มย่อยของตระกูลภาษาขร้า - ไทและเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย [ 3 ][ 4 ]' + case 'bod': + assert text == 'Test this Tibetan string: དུང་དང་འོ་མར་འགྲན་པའི་ལྷག་བསམ་མཐུ། །དམན་ཡང་དཀར་པོའི་བྱས་འབྲས་ཅུང་ཟད་ཅིག །བློ་དང་འདུན་པ་བཟང་བའི་རང་རིགས་ཀུན། །རྒྱལ་ཁའི་འཕྲིན་བཟང་ལས་དོན་འགྲུབ་ཕྱིར་འབད།།' + case _: + raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) if __name__ == '__main__': for lang in test_langs_local: diff --git a/tests/tests_nlp/test_word_tokenization.py b/tests/tests_nlp/test_word_tokenization.py index 346c0bfe8..3a088e31b 100644 --- a/tests/tests_nlp/test_word_tokenization.py +++ b/tests/tests_nlp/test_word_tokenization.py @@ -19,7 +19,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import wl_word_tokenization +from wordless.wl_nlp import wl_texts, wl_word_tokenization from wordless.wl_utils import wl_misc _, is_macos, _ = wl_misc.check_os() @@ -68,6 +68,7 @@ def test_word_tokenize(lang, word_tokenizer): lang = lang, word_tokenizer = word_tokenizer ) + tokens = wl_texts.to_display_texts(tokens) print(f'{lang} / {word_tokenizer}:') print(f'{tokens}\n') @@ -82,249 +83,256 @@ def test_word_tokenize(lang, word_tokenizer): tests_lang_util_skipped = False - if lang == 'afr': - assert tokens == ['Afrikaans', 'is', 'tipologies', 'beskou', "'", 'n', 'Indo', '-', 'Europese', ',', 'Wes', '-', 'Germaanse', ',', 'Nederfrankiese', 'taal,[2', ']', 'wat', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', 'ontstaan', 'het', '.'] - elif lang == 'sqi': - assert tokens == ['Gjuha', 'shqipe', '(', 'ose', 'thjesht', 'shqipja', ')', 'është', 'gjuhë', 'dhe', 'degë', 'e', 'veçantë', 'e', 'familjes', 'indo', '-', 'evropiane', 'që', 'flitet', 'nga', 'rreth', '7', '-', '10', 'milionë', 'njerëz', 'në', 'botë,[1', ']', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'dhe', 'Maqedoninë', 'e', 'Veriut', ',', 'por', 'edhe', 'në', 'zona', 'të', 'tjera', 'të', 'Evropës', 'Juglindore', 'ku', 'ka', 'një', 'popullsi', 'shqiptare', ',', 'duke', 'përfshirë', 'Malin', 'e', 'Zi', 'dhe', 'Luginën', 'e', 'Preshevës', '.'] - elif lang == 'amh': - assert tokens == ['አማርኛ[1', ']', '፡', 'የኢትዮጵያ', '፡', 'መደበኛ', '፡', 'ቋንቋ', '፡', 'ነው', '።'] - elif lang == 'ara': - assert tokens == ['ٱللُّغَةُ', 'ٱلْعَرَبِيَّة', 'هي', 'أكثر', 'اللغات', 'السامية', 'تحدثًا', '،', 'وإحدى', 'أكثر', 'اللغات', 'انتشاراً', 'في', 'العالم', '،', 'يتحدثها', 'أكثر', 'من', '467', 'مليون', 'نسمة.(1', ')'] - elif lang == 'hye': - assert tokens == ['Հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն։', 'Գրաբարով', 'է', 'ավանդված', 'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 'աստվածաբանական-դավանաբանական', 'գրականությունը։'] - elif lang == 'asm': - assert tokens == ['অসমীয়া', 'ভাষা', 'হৈছে', 'সকলোতকৈ', 'পূৰ্বীয়', 'ভাৰতীয়-আৰ্য', 'ভাষা', '।'] - elif lang == 'aze': - assert tokens == ['Azərbaycan', 'dili[2][3', ']', '(', 'Cənubi', 'Azərbaycanda', ':', 'Türk', 'dili[4][5', ']', ')', '—', 'Azərbaycan', 'Respublikasının', 'və', 'Rusiya', 'Federasiyası', 'Dağıstan', 'Respublikasının[6', ']', 'rəsmi', 'dövlət', 'dili', '.'] - elif lang == 'eus': - assert tokens == ['Euskara', 'Euskal', 'Herriko', 'hizkuntza', 'da.[8', ']'] - elif lang == 'ben': - if word_tokenizer == 
'sacremoses_moses': - assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামেও', 'পরিচিত', ')', 'একটি', 'ইন্দো-আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।'] - elif word_tokenizer == 'spacy_ben': - assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।'] - else: - tests_lang_util_skipped = True - elif lang == 'bul': - assert tokens == ['Бъ̀лгарският', 'езѝк', 'е', 'индоевропейски', 'език', 'от', 'групата', 'на', 'южнославянските', 'езици', ',', 'като', 'образува', 'неговата', 'източна', 'подгрупа', '.'] - elif lang == 'cat': - assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'del', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', 'd', "'", 'algunes', 'comarques', 'i', 'localitats', 'de', 'l', "'", 'interior', ')', ',', 'les', 'Illes', 'Balears', '(', 'on', 'també', 'rep', 'el', 'nom', 'de', 'mallorquí', ',', 'menorquí', ',', 'eivissenc', 'o', 'formenterer', 'segons', 'l', "'", 'illa', ')', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', 'l', "'", 'Aragó', ')', ',', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', '(', 'a', 'l', "'", 'illa', 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[', '8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'habitat', 'per', 'pobladors', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', 'l', "'", 'Argentina', ',', 'amb', '200.000', 'parlants', ')', '.', '[', '11', ']'] - elif lang == 'zho_cn': - if word_tokenizer == 'pkuseg_zho': - assert tokens == ['汉语', '又', '称', '中文', '、', '华语', '[', '6', ']', '、', '唐', '话[', '7]', ',', '概指', '由', '上古', '汉语', '(', '先秦', '雅言', ')', '发展', '而', '来', '、', '书面', '使用', '汉字', '的', '分析语', ',', '为', '汉藏', '语系', '最', '大', '的', '一', '支', '语族', '。'] - elif word_tokenizer == 'wordless_zho_char': - assert tokens == ['汉', '语', '又', '称', '中', '文', '、', '华', '语', '[', '6', ']', '、', '唐', '话', '[', '7', ']', ',', '概', '指', '由', '上', '古', '汉', '语', '(', '先', '秦', '雅', '言', ')', '发', '展', '而', '来', '、', '书', '面', '使', '用', '汉', '字', '的', '分', '析', '语', ',', '为', '汉', '藏', '语', '系', '最', '大', '的', '一', '支', '语', '族', '。'] - else: - tests_lang_util_skipped = True - elif lang == 'zho_tw': - if word_tokenizer == 'pkuseg_zho': - assert tokens == ['漢語', '又', '稱', '中文', '、', '華', '語[', '6', ']', '、', '唐', '話[', '7]', ',', '概指', '由', '上古', '漢語', '(', '先秦', '雅言', ')', '發展', '而', '來', '、', '書面', '使用', '漢字', '的', '分析', '語', ',', '為漢', '藏語系', '最', '大', '的', '一', '支', '語族', '。'] - elif word_tokenizer == 'wordless_zho_char': - assert tokens == ['漢', '語', '又', '稱', '中', '文', '、', '華', '語', '[', '6', ']', '、', '唐', '話', '[', '7', ']', ',', '概', '指', '由', '上', '古', '漢', '語', '(', '先', '秦', '雅', '言', ')', '發', '展', '而', '來', '、', '書', '面', '使', '用', '漢', '字', '的', '分', '析', '語', ',', '為', '漢', '藏', '語', '系', '最', '大', '的', '一', '支', '語', '族', '。'] - 
else: - tests_lang_util_skipped = True - elif lang == 'hrv': - assert tokens == ['Hrvatski', 'jezik', '(', 'ISO', '639', '-', '3', ':', 'hrv', ')', 'skupni', 'je', 'naziv', 'za', 'nacionalni', 'standardni', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječja', 'i', 'govora', 'kojima', 'govore', 'ili', 'su', 'nekada', 'govorili', 'Hrvati', '.'] - elif lang == 'ces': - assert tokens == ['Čeština', 'neboli', 'český', 'jazyk', 'je', 'západoslovanský', 'jazyk', ',', 'nejbližší', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.'] - elif lang == 'dan': - assert tokens == ['Dansk', 'er', 'et', 'østnordisk', 'sprog', 'indenfor', 'den', 'germanske', 'gren', 'af', 'den', 'indoeuropæiske', 'sprogfamilie', '.'] - elif lang == 'nld': - assert tokens == ['Het', 'Nederlands', 'is', 'een', 'West-Germaanse', 'taal', ',', 'de', 'meest', 'gebruikte', 'taal', 'in', 'Nederland', 'en', 'België', ',', 'de', 'officiële', 'taal', 'van', 'Suriname', 'en', 'een', 'van', 'de', 'drie', 'officiële', 'talen', 'van', 'België', '.'] - elif lang.startswith('eng_') or lang == 'other': - if word_tokenizer in ['nltk_nist', 'nltk_regex']: - assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo', '-', 'European', 'language', 'family', '.'] - elif word_tokenizer in ['nltk_nltk', 'nltk_penn_treebank', 'nltk_tok_tok', 'nltk_twitter', 'sacremoses_moses']: - assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'est': - assert tokens == ['Eesti', 'keelel', 'on', 'kaks', 'suuremat', 'murderühma', '(', 'põhjaeesti', 'ja', 'lõunaeesti', ')', ',', 'mõnes', 'käsitluses', 'eristatakse', 'ka', 'kirderanniku', 'murdeid', 'eraldi', 'murderühmana', '.'] - elif lang == 'fin': - assert tokens == ['Suomen', 'kieli', 'eli', 'suomi', 'on', 'uralilaisten', 'kielten', 'itämerensuomalaiseen', 'ryhmään', 'kuuluva', 'kieli', ',', 'jota', 'puhuvat', 'pääosin', 'suomalaiset', '.'] - elif lang == 'fra': - assert tokens == ['Le', 'français', 'est', 'une', 'langue', 'indo-européenne', 'de', 'la', 'famille', 'des', 'langues', 'romanes', 'dont', 'les', 'locuteurs', 'sont', 'appelés', 'francophones', '.'] - elif lang == 'lug': - assert tokens == ['Luganda', '/', 'Oluganda', 'lwe', 'lulimi', 'olwogerwa', 'Abaganda', 'e', 'Yuganda', '.'] - elif lang.startswith('deu_'): - assert tokens == ['Das', 'Deutsche', 'ist', 'eine', 'plurizentrische', 'Sprache', ',', 'enthält', 'also', 'mehrere', 'Standardvarietäten', 'in', 'verschiedenen', 'Regionen', '.'] - elif lang == 'grc': - assert tokens == ['ἦλθον', 'δὲ', 'οἱ', 'δύο', 'ἄγγελοι', 'εἰς', 'Σόδομα', 'ἑσπέρας', '·', 'Λὼτ', 'δὲ', 'ἐκάθητο', 'παρὰ', 'τὴν', 'πύλην', 'Σοδόμων', '.', 'ἰδὼν', 'δὲ', 'Λὼτ', 'ἐξανέστη', 'εἰς', 'συνάντησιν', 'αὐτοῖς', 'καὶ', 'προσεκύνησεν', 'τῷ', 'προσώπῳ', 'ἐπὶ', 'τὴν', 'γῆν'] - elif lang == 'ell': - assert tokens == ['Η', 'ελληνική', 'γλώσσα', 'ανήκει', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια', '[', '9', ']', 'και', 'αποτελεί', 'το', 'μοναδικό', 'μέλος', 'του', 'ελληνικού', 'κλάδου', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδας', 'και', 'της', 'Κύπρου', '.'] - elif lang == 'guj': - if word_tokenizer == 'sacremoses_moses': - assert tokens == ['ગુજરાતી', '\u200d', '(', '/', 'ɡʊdʒəˈrɑːti', '/', '[', '૭', ']', ',', 'રોમન', 'લિપિમાં', ':', 'Gujarātī', ',', 'ઉચ્ચાર', ':', '[', 'ɡudʒəˈɾɑːtiː', ']', ')', 'ભારત', 'દેશના', 'ગુજરાત', 'રાજ્યની', 'ઇન્ડો-આર્યન', 'ભાષા', 'છે', ',', 'અને', 'મુખ્યત્વે', 
'ગુજરાતી', 'લોકો', 'દ્વારા', 'બોલાય', 'છે', '.'] - elif word_tokenizer == 'spacy_guj': - assert tokens == ['ગુજરાતી', '\u200d(/ɡʊdʒəˈrɑːti/[૭', ']', ',', 'રોમન', 'લિપિમાં', ':', 'Gujarātī', ',', 'ઉચ્ચાર', ':', '[', 'ɡudʒəˈɾɑːtiː', ']', ')', 'ભારત', 'દેશના', 'ગુજરાત', 'રાજ્યની', 'ઇન્ડો-આર્યન', 'ભાષા', 'છે', ',', 'અને', 'મુખ્યત્વે', 'ગુજરાતી', 'લોકો', 'દ્વારા', 'બોલાય', 'છે.'] - else: - tests_lang_util_skipped = True - elif lang == 'heb': - assert tokens == ['עִבְרִית', 'היא', 'שפה', 'שמית', ',', 'ממשפחת', 'השפות', 'האפרו', '-', 'אסיאתיות', ',', 'הידועה', 'כשפתם', 'של', 'היהודים', 'ושל', 'השומרונים', '.'] - elif lang == 'hin': - assert tokens == ['हिन्दी', 'जिसके', 'मानकीकृत', 'रूप', 'को', 'मानक', 'हिन्दी', 'कहा', 'जाता', 'है', ',', 'विश्व', 'की', 'एक', 'प्रमुख', 'भाषा', 'है', 'और', 'भारत', 'की', 'एक', 'राजभाषा', 'है', '।'] - elif lang == 'hun': - assert tokens == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tagja', ',', 'a', 'finnugor', 'nyelvek', 'közé', 'tartozó', 'ugor', 'nyelvek', 'egyike', '.'] - elif lang == 'isl': - if word_tokenizer == 'sacremoses_moses': - assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga', '.', '[', '5', ']'] - elif word_tokenizer == 'spacy_isl': - assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga.[5', ']'] - else: - tests_lang_util_skipped = True - elif lang == 'ind': - assert tokens == ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'nasional', 'dan', 'resmi', 'di', 'seluruh', 'wilayah', 'Indonesia', '.'] - elif lang == 'gle': - assert tokens == ['Is', 'ceann', 'de', 'na', 'teangacha', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'de', 'na', 'trí', 'cinn', 'de', 'theangacha', 'Ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'Gaelacha', '(', 'Gaeilge', ',', 'Gaeilge', 'Mhanann', 'agus', 'Gaeilge', 'na', 'hAlban', ')', 'go', 'háirithe', '.'] - elif lang == 'ita': - assert tokens == ["L'", 'italiano', '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascoltaⓘ', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.'] - elif lang == 'jpn': - if word_tokenizer == 'sudachipy_jpn_split_mode_a': - assert tokens == ['日本', '語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注釈', '2', ']', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だっ', 'た', '国', '、', 'そして', '国外', '移民', 'や', '移住', '者', 'を', '含む', '日本', '人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', '。'] - elif word_tokenizer in [ - 'sudachipy_jpn_split_mode_b', - 'sudachipy_jpn_split_mode_c' - ]: - assert tokens == ['日本語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注釈', '2', ']', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だっ', 'た', '国', '、', 'そして', '国外', '移民', 'や', '移住者', 'を', '含む', '日本人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', '。'] - elif word_tokenizer == 'wordless_jpn_kanji': - assert tokens == ['日', '本', '語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注', '釈', '2', ']', ')', 'は', '、', '日', '本', '国', '内', 'や', '、', 'かつて', 'の', '日', '本', '領', 'だっ', 'た', '国', '、', 'そして', '国', '外', '移', '民', 'や', '移', '住', '者', 'を', '含', 'む', '日', '本', '人', '同', '士', 'の', '間', 'で', '使', '用', 'さ', 
'れ', 'て', 'いる', '言', '語', '。'] - else: - tests_lang_util_skipped = True - elif lang == 'kan': - assert tokens == ['ದ್ರಾವಿಡ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಪ್ರಾಮುಖ್ಯವುಳ್ಳ', 'ಭಾಷೆಯೂ', 'ಭಾರತದ', 'ಪುರಾತನವಾದ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಒಂದೂ', 'ಆಗಿರುವ', 'ಕನ್ನಡ', 'ಭಾಷೆಯನ್ನು', 'ಅದರ', 'ವಿವಿಧ', 'ರೂಪಗಳಲ್ಲಿ', 'ಸುಮಾರು', '೪೫', 'ದಶಲಕ್ಷ', 'ಜನರು', 'ಆಡು', 'ನುಡಿಯಾಗಿ', 'ಬಳಸುತ್ತಲಿದ್ದಾರೆ', '.'] - elif lang == 'khm': - assert tokens == ['ភាសា', 'ខ្មែរ', 'គឺជា', 'ភាសា', 'កំណើត', 'របស់', 'ជនជាតិ', 'ខ្មែរ', 'និង', 'ជា', 'ភាសា', 'ផ្លូវការ', 'របស់', 'ប្រទេស', 'កម្ពុជា', '។'] - elif lang == 'kor': - assert tokens == ['세계', '여러', '지역', '에', '한', '민족', '인구', '가', '거주', '하', '게', '되', '면서', '전', '세계', '각지', '에서', '한국어', '가', '사용', '되', '고', '있', '다', '.'] - elif lang == 'kir': - assert tokens == ['Кыргыз', 'тили', '—', 'Кыргыз', 'Республикасынын', 'мамлекеттик', 'тили', ',', 'түрк', 'тилдеринин', 'курамына', ',', 'анын', 'ичинде', 'кыргыз-кыпчак', 'же', 'тоо-алтай', 'тобуна', 'кирет', '.'] - elif lang == 'lao': - assert tokens == ['ພາສາລາວ', '(', 'Lao', ':', 'ລາວ', ',', '[', 'l', 'áː', 'w', ']', 'ຫຼື', 'ພາສາລາວ', ',', '[', 'p', 'ʰáː', 's', 'ǎː', 'l', 'áː', 'w', '])', 'ເປັນ', 'ພາສາ', 'ຕະກູນ', 'ໄທ', '-', 'ກະໄດ', 'ຂອງ', 'ຄົນ', 'ລາວ', 'ໂດຍ', 'ມີ', 'ຄົນ', 'ເວົ້າ', 'ໃນປະເທດລາວ', 'ເຊິ່ງ', 'ເປັນ', 'ພາສາ', 'ລັດຖະການ', 'ຂອງ', 'ສາທາລະນະລັດ', 'ປະຊາທິປະໄຕ', 'ປະຊາຊົນ', 'ລາວ', 'ຂອງ', 'ປະຊາກອນ', 'ປະມານ', '7', 'ລ້ານ', 'ຄົນ', 'ແລະ', 'ໃນ', 'ພື້ນທີ່', 'ພາກ', 'ຕາເວັນອອກສຽງ', 'ເໜືອ', 'ຂອງ', 'ປະເທດໄທ', 'ທີ່ມີ', 'ຄົນ', 'ເວົ້າ', 'ປະມານ', '23', 'ລ້ານ', 'ຄົນ', 'ທາງ', 'ລັດຖະບານ', 'ປະເທດໄທ', 'ມີການສະໜັບສະໜຸນ', 'ໃຫ້', 'ເອີ້ນ', 'ພາສາລາວ', 'ຖິ່ນ', 'ໄທ', 'ວ່າ', 'ພາສາລາວ', 'ຖິ່ນ', 'ອີສານ', 'ນອກຈາກ', 'ນີ້', ',', 'ຢູ່', 'ທາງ', 'ພາກ', 'ຕາເວັນອອກສຽງ', 'ເໜືອ', 'ຂອງ', 'ປະເທດກຳປູເຈຍ', 'ກໍ', 'ມີ', 'ຄົນ', 'ເວົ້າ', 'ພາສາລາວ', 'ຄືກັນ', '.'] - elif lang == 'lat': - assert tokens == ['Lingua', 'Latina,[1', ']', 'sive', 'sermo', 'Latinus,[2', ']', 'est', 'lingua', 'Indoeuropaea', 'qua', 'primum', 'Latini', 'universi', 'et', 'Romani', 'antiqui', 'in', 'primis', 'loquebantur', 'quamobrem', 'interdum', 'etiam', 'lingua', 'Latia[3', ']', '(', 'in', 'Latio', 'enim', 'sueta', ')', 'et', 'lingua', 'Romana[4', ']', '(', 'nam', 'imperii', 'Romani', 'sermo', 'sollemnis', ')', 'appellatur', '.'] - elif lang == 'lav': - if word_tokenizer == 'sacremoses_moses': - assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,5', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda', '.', '[', '1', ']', '[', '3', ']'] - elif word_tokenizer == 'spacy_lav': - assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,5', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda.[1][3', ']'] - else: - tests_lang_util_skipped = True - elif lang == 'lij': - assert tokens == ['O', 'baxin', "d'", 'influensa', 'di', 'dialetti', 'lìguri', 'o', "l'", 'é', 'de', 'çirca', '2', 'milioìn', 'de', 'personn', '-', 'e', 'anche', 'se', ',', 'specialmente', 'inti', 'ùrtimi', "çinquant'", 'anni', ',', 'pe', 'coscì', 'de', 'variante', 'locali', 'se', 'son', 'pèrse', 'e', 'de', 'âtre', 'son', 'a', 'reizego', "tutt'", 'òua', ',', 'anche', 'pe', 'córpa', 'da', 'mancansa', 'de', "'", 'n', 'pâ', 'de', 'generaçioin', 'inta', 'continoasion', 'da', 'parlâ', '.'] - elif lang == 'lit': - assert tokens == ['Lietuvių', 'kalba', '–', 'iš', 'baltų', 'prokalbės', 'kilusi', 'lietuvių', 'tautos', 'kalba', ',', 'kuri', 'Lietuvoje', 'yra', 'valstybinė', ',', 'o', 'Europos', 'Sąjungoje', '–', 'viena', 'iš', 
'oficialiųjų', 'kalbų', '.'] - elif lang == 'ltz': - assert tokens == ["D'", 'Lëtzebuergesch', 'gëtt', 'an', 'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.'] - elif lang == 'mkd': - assert tokens == ['Македонски', 'јазик', '—', 'јужнословенски', 'јазик', ',', 'дел', 'од', 'групата', 'на', 'словенски', 'јазици', 'од', 'јазичното', 'семејство', 'на', 'индоевропски', 'јазици', '.'] - elif lang == 'msa': - assert tokens == ['Bahasa', 'Melayu', '(', 'Tulisan', 'Jawi', ':', 'بهاس', 'ملايو', ';', 'Rencong', ':', 'ꤷꥁꤼ', 'ꤸꥍꤾꤿꥈ', ')', 'ialah', 'salah', 'satu', 'daripada', 'bahasa', '-', 'bahasa', 'Melayu', '-', 'Polinesia', 'di', 'bawah', 'keluarga', 'bahasa', 'Austronesia', ',', 'yang', 'merupakan', 'bahasa', 'rasmi', 'di', 'Brunei', ',', 'Indonesia', ',', 'Malaysia', 'dan', 'Singapura', ',', 'serta', 'dituturkan', 'di', 'Timor', 'Leste', 'dan', 'sebahagian', 'wilayah', 'di', 'Kemboja', ',', 'Filipina', 'dan', 'Thailand', '.'] - elif lang == 'mal': - if word_tokenizer == 'sacremoses_moses': - assert tokens == ['ഇന്ത്യയിൽ', 'കേരള', 'സംസ്ഥാനത്തിലും', 'കേന്ദ്രഭരണപ്രദേശങ്ങളായ', 'ലക്ഷദ്വീപിലും', 'പോണ്ടിച്ചേരിയുടെ', 'ഭാഗമായ', 'മാഹിയിലും', 'തമിഴ്നാട്ടിലെ', 'കന്യാകുമാരി', 'ജില്ലയിലും', 'നീലഗിരി', 'ജില്ലയിലെ', 'ഗൂഡല്ലൂർ', 'താലൂക്കിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം', '.'] - elif word_tokenizer == 'spacy_mal': - assert tokens == ['ഇന്ത്യയിൽ', 'കേരള', 'സംസ്ഥാനത്തിലും', 'കേന്ദ്രഭരണപ്രദേശങ്ങളായ', 'ലക്ഷദ്വീപിലും', 'പോണ്ടിച്ചേരിയുടെ', 'ഭാഗമായ', 'മാഹിയിലും', 'തമിഴ്നാട്ടിലെ', 'കന്യാകുമാരി', 'ജില്ലയിലും', 'നീലഗിരി', 'ജില്ലയിലെ', 'ഗൂഡല്ലൂർ', 'താലൂക്കിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം.'] - else: - tests_lang_util_skipped = True - elif lang == 'mar': - if word_tokenizer == 'sacremoses_moses': - assert tokens == ['मराठी', 'भाषा', 'ही', 'इंडो-युरोपीय', 'भाषाकुळातील', 'एक', 'भाषा', 'आहे', '.'] - elif word_tokenizer == 'spacy_mar': - assert tokens == ['मराठी', 'भाषा', 'ही', 'इंडो', '-', 'युरोपीय', 'भाषाकुळातील', 'एक', 'भाषा', 'आहे', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'mni_mtei': - assert tokens == ['ꯃꯤꯇꯩꯂꯣꯟ', '(', 'ꯃꯤꯇꯩꯂꯣꯜ', ')', 'ꯅꯠꯇ', '꯭', 'ꯔꯒ', 'ꯃꯩꯇꯩꯂꯣꯟ', '(', 'ꯃꯩꯇꯩꯂꯣꯜ', ')', 'ꯅꯠꯇ', '꯭', 'ꯔꯒ', 'ꯃꯅꯤꯄꯨꯔꯤ', 'ꯂꯣꯟ', '(', 'ꯃꯅꯤꯄꯨꯔꯤ', 'ꯂꯣꯜ', ')', 'ꯑꯁꯤ', 'ꯑꯋꯥꯡ-ꯅꯣꯡꯄꯣꯛ', 'ꯏꯟꯗꯤꯌꯥꯒꯤ', 'ꯃꯅꯤꯄꯨꯔꯗ', 'ꯃꯄꯨꯡ', 'ꯑꯣꯢꯅ', 'ꯉꯥꯡꯅꯕ', 'ꯂꯣꯟ', 'ꯑꯃꯅꯤ', '꯫'] - elif lang == 'nep': - assert tokens == ['नेपाली', 'भाषा', '(', 'अन्तर्राष्ट्रिय', 'ध्वन्यात्मक', 'वर्णमाला', '[', 'neˈpali', 'bʱaʂa', ']', ')', 'नेपालको', 'सम्पर्क', 'भाषा', 'तथा', 'भारत', ',', 'भुटान', 'र', 'म्यानमारको', 'केही', 'भागमा', 'मातृभाषाको', 'रूपमा', 'बोलिने', 'भाषा', 'हो', '।'] - elif lang == 'nob': - assert tokens == ['Bokmål', 'er', 'en', 'varietet', 'av', 'norsk', 'skriftspråk', '.'] - elif lang == 'ori': - assert tokens == ['ଓଡ଼ିଆ', '(', 'ଇଂରାଜୀ', 'ଭାଷାରେ', 'Odia', '/', 'əˈdiːə', '/', 'or', 'Oriya', '/', 'ɒˈriːə', '/', ',', ')', 'ଏକ', 'ଭାରତୀୟ', 'ଭାଷା', 'ଯାହା', 'ଏକ', 'ଇଣ୍ଡୋ-ଇଉରୋପୀୟ', 'ଭାଷାଗୋଷ୍ଠୀ', 'ଅନ୍ତର୍ଗତ', 'ଇଣ୍ଡୋ-ଆର୍ଯ୍ୟ', 'ଭାଷା', '।'] - elif lang == 'fas': - assert tokens == ['فارسی', 'یا', 'پارسی', 'یک', 'زبان', 'ایرانی', 'غربی', 'از', 'زیرگروه', 'ایرانی', 'شاخهٔ', 'هندوایرانیِ', 'خانوادهٔ', 'زبان\u200cهای', 'هندواروپایی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', 'تاجیکستان', '،', 'ازبکستان', '،', 'پاکستان', '،', 'عراق', '،', 'ترکمنستان', 'و', 'آذربایجان', 'به', 'آن', 'سخن', 'می\u200cگویند', '.'] - elif lang == 'pol': - assert tokens == ['Język', 'polski', ',', 'polszczyzna', '–', 
'język', 'z', 'grupy', 'zachodniosłowiańskiej', '(', 'do', 'której', 'należą', 'również', 'czeski', ',', 'kaszubski', ',', 'słowacki', 'i', 'języki', 'łużyckie', ')', ',', 'stanowiącej', 'część', 'rodziny', 'indoeuropejskiej', '.'] - elif lang.startswith('por_'): - assert tokens == ['A', 'língua', 'portuguesa', ',', 'também', 'designada', 'português', ',', 'é', 'uma', 'língua', 'indo-europeia', 'românica', 'flexiva', 'ocidental', 'originada', 'no', 'galego-português', 'falado', 'no', 'Reino', 'da', 'Galiza', 'e', 'no', 'norte', 'de', 'Portugal', '.'] - elif lang == 'pan_guru': - assert tokens == ['ਪੰਜਾਬੀ', 'ਭਾਸ਼ਾ', '(', 'ਸ਼ਾਹਮੁਖੀ', ':', '\u200e', 'پنجابی', ',', 'ਪੰਜਾਬੀ', ')', 'ਪੰਜਾਬ', 'ਦੀ', 'ਭਾਸ਼ਾ', 'ਹੈ', ',', 'ਜਿਸ', 'ਨੂੰ', 'ਪੰਜਾਬ', 'ਖੇਤਰ', 'ਦੇ', 'ਵਸਨੀਕ', 'ਜਾਂ', 'ਸੰਬੰਧਿਤ', 'ਲੋਕ', 'ਬੋਲਦੇ', 'ਹਨ', '।', '[', '18', ']'] - elif lang == 'ron': - assert tokens == ['Limba', 'română', 'este', 'o', 'limbă', 'indo-europeană', 'din', 'grupul', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbilor', 'romanice', '.'] - elif lang == 'rus': - if word_tokenizer == 'nltk_tok_tok': - assert tokens == ['Ру́сский', 'язы́к', '(', 'МФА', ':', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянской', 'группы', 'славянской', 'ветви', 'индоевропейской', 'языковой', 'семьи', ',', 'национальный', 'язык', 'русского', 'народа', '.'] - elif word_tokenizer == 'sacremoses_moses': - assert tokens == ['Ру', '́', 'сский', 'язы', '́', 'к', '(', 'МФА', ':', '[', 'ˈruskʲɪi', '̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянской', 'группы', 'славянской', 'ветви', 'индоевропейской', 'языковой', 'семьи', ',', 'национальный', 'язык', 'русского', 'народа', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'san': - assert tokens == ['संस्कृतम्', 'जगतः', 'एकतमा', 'अतिप्राचीना', 'समृद्धा', 'शास्त्रीया', 'च', 'भाषासु', 'वर्तते', '।'] - elif lang == 'srp_cyrl': - assert tokens == ['Српски', 'језик', 'је', 'званичан', 'у', 'Србији', ',', 'Босни', 'и', 'Херцеговини', 'и', 'Црној', 'Гори', 'и', 'говори', 'га', 'око', '12', 'милиона', 'људи.[13', ']'] - elif lang == 'srp_latn': - assert tokens == ['Srpski', 'jezik', 'je', 'zvaničan', 'u', 'Srbiji', ',', 'Bosni', 'i', 'Hercegovini', 'i', 'Crnoj', 'Gori', 'i', 'govori', 'ga', 'oko', '12', 'miliona', 'ljudi.[13', ']'] - elif lang == 'sin': - assert tokens == ['ශ්\u200dරී', 'ලංකාවේ', 'ප්\u200dරධාන', 'ජාතිය', 'වන', 'සිංහල', 'ජනයාගේ', 'මව්', 'බස', 'සිංහල', 'වෙයි', '.'] - elif lang == 'slk': - assert tokens == ['Slovenčina', 'je', 'oficiálne', 'úradným', 'jazykom', 'Slovenska', ',', 'Vojvodiny', 'a', 'od', '1', '.', 'mája', '2004', 'jedným', 'z', 'jazykov', 'Európskej', 'únie', '.'] - elif lang == 'slv': - assert tokens == ['Slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'je', 'združeni', 'naziv', 'za', 'uradni', 'knjižni', 'jezik', 'Slovencev', 'in', 'skupno', 'ime', 'za', 'narečja', 'in', 'govore', ',', 'ki', 'jih', 'govorijo', 'ali', 'so', 'jih', 'nekoč', 'govorili', 'Slovenci', '.'] - elif lang == 'dsb': - assert tokens == ['Dolnoserbšćina', ',', 'dolnoserbska', 'rěc', '(', 'nimski', 'Niedersorbisch', 'abo', 'teke', 'Wendisch', ',', 'pólski', 'język', 'dolnołużycki', ',', 'česki', 'dolnolužická', 'srbština', ')', 'jo', 'jadna', 'z', 'dweju', 'rěcowu', 'Serbow', ',', 'kotaraž', 'se', 'wužywa', 'w', 'Dolnej', 'Łužycy', ',', 'w', 'pódpołdnjowej', 'Bramborskej', ',', 'na', 'pódzajtšu', 'Nimskej', '.'] - elif lang == 'hsb': - assert tokens == ['Hornjoserbšćina', 'je', 'zapadosłowjanska', 
'rěč', ',', 'kotraž', 'so', 'w', 'Hornjej', 'Łužicy', 'wokoło', 'městow', 'Budyšin', ',', 'Kamjenc', 'a', 'Wojerecy', 'rěči', '.'] - elif lang == 'spa': - assert tokens == ['El', 'español', 'o', 'castellano', 'es', 'una', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablado', ',', 'perteneciente', 'a', 'la', 'familia', 'de', 'lenguas', 'indoeuropeas', '.'] - elif lang == 'swe': - assert tokens == ['Svenska', '(', 'svenska', '(', 'info', ')', ')', 'är', 'ett', 'östnordiskt', 'språk', 'som', 'talas', 'av', 'ungefär', 'tio', 'miljoner', 'personer', 'främst', 'i', 'Sverige', 'där', 'språket', 'har', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'det', 'ena', 'nationalspråket', 'i', 'Finland', 'och', 'som', 'enda', 'officiella', 'språk', 'på', 'Åland', '.'] - elif lang == 'tgl': - assert tokens == ['Ang', 'wikang', 'Tagalog[1', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜆᜄᜎᜓ', ')', ',', 'o', 'ang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pinakaginagamit', 'na', 'wika', 'ng', 'Pilipinas', '.'] - elif lang == 'tgk': - assert tokens == ['Забони', 'тоҷикӣ', '—', 'забоне', ',', 'ки', 'дар', 'Эрон', ':', 'форсӣ', ',', 'ва', 'дар', 'Афғонистон', 'дарӣ', 'номида', 'мешавад', ',', 'забони', 'давлатии', 'кишварҳои', 'Тоҷикистон', ',', 'Эрон', 'ва', 'Афғонистон', 'мебошад', '.'] - elif lang == 'tam': - assert tokens == ['தமிழ்', '(', 'Tamil', 'language', ')', 'தமிழர்களினதும்', 'தமிழ்', 'பேசும்', 'பலரின்', 'தாய்மொழி', 'ஆகும்', '.'] - elif lang == 'tat': - assert tokens == ['Татар', 'теле', '—', 'татарларның', 'милли', 'теле', ',', 'Татарстанның', 'дәүләт', 'теле', ',', 'таралышы', 'буенча', 'Россиядә', 'икенче', 'тел', '.'] - elif lang == 'tel': - assert tokens == ['తెలుగు', 'అనేది', 'ద్రావిడ', 'భాషల', 'కుటుంబానికి', 'చెందిన', 'భాష', '.'] - elif lang == 'tdt': - assert tokens == ['Tetun', '(', 'iha', 'portugés', ':', 'tétum', ';', 'iha', 'inglés', ':', 'Tetum', ')', 'ne', "'", 'e', 'lian', 'nasionál', 'no', 'ko-ofisiál', 'Timór', 'Lorosa', "'", 'e', 'nian', '.'] - elif lang == 'tha': - if word_tokenizer in [ - 'pythainlp_longest_matching', - 'pythainlp_max_matching_tcc' - ]: - assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย', '[', '3', '][', '4', ']'] - elif word_tokenizer == 'pythainlp_max_matching': - assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทยกลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย', '[', '3', '][', '4', ']'] - elif word_tokenizer == 'pythainlp_nercut': - assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย', '[', '3', '][', '4', ']'] - else: - tests_lang_util_skipped = True - elif lang == 'bod': - assert tokens == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'བལ་ཡུལ', '།', 'འབྲུག་', 'དང་', 'འབྲས་ལྗོངས', '།'] - elif lang == 'tir': - assert tokens == ['ትግርኛ', 'ኣብ', 'ኤርትራን', 'ኣብ', 'ሰሜናዊ', 'ኢትዮጵያን', 'ኣብ', 'ክልል', 'ትግራይ', 'ዝዝረብ', 'ሴማዊ', 'ቋንቋ', 'እዩ', '።'] - elif lang == 'tsn': - assert tokens == ['Setswana', 'ke', 'teme', 'e', 'e', 'buiwang', 'mo', 'mafatsheng', 'a', 'Aforika', 
'Borwa', ',', 'Botswana', ',', 'Namibia', 'le', 'Zimbabwe', '.'] - elif lang == 'tur': - assert tokens == ['Türkçe', 'ya', 'da', 'Türk', 'dili', ',', 'Güneydoğu', 'Avrupa', 've', 'Batı', "Asya'da", 'konuşulan', ',', 'Türk', 'dilleri', 'dil', 'ailesine', 'ait', 'sondan', 'eklemeli', 'bir', 'dil.[12', ']'] - elif lang == 'ukr': - assert tokens == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичні', 'назви', '—', 'ру́ська', ',', 'руси́нська[10][11][12', ']', '[', '*', '1', ']', ')', '—', 'національна', 'мова', 'українців', '.'] - elif lang == 'urd': - assert tokens == ['اُردُو[8', ']', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے', '۔'] - elif lang == 'vie': - if word_tokenizer == 'nltk_tok_tok': - assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi', 'là', 'tiếng', 'Việt', 'Nam[', '9', ']', 'hay', 'Việt', 'ngữ', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.'] - elif word_tokenizer == 'underthesea_vie': - assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi là', 'tiếng', 'Việt Nam', '[', '9', ']', 'hay', 'Việt ngữ', 'là', 'ngôn ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn ngữ', 'chính thức', 'tại', 'Việt Nam', '.'] - else: - tests_lang_util_skipped = True - elif lang == 'yor': - assert tokens == ['Èdè', 'Yorùbá', 'Ni', 'èdè', 'tí', 'ó', 'ṣàkójọ', 'pọ̀', 'gbogbo', 'kú', 'oótu', 'o', '-', 'ò', '-', 'jíire', 'bí', ',', 'níapá', 'ìwọ̀', 'Oòrùn', 'ilẹ̀', 'Nàìjíríà', ',', 'tí', 'a', 'bá', 'wo', 'èdè', 'Yorùbá', ',', 'àwọn', 'onímọ̀', 'pín', 'èdè', 'náà', 'sábẹ́', 'ẹ̀yà', 'Kwa', 'nínú', 'ẹbí', 'èdè', 'Niger', '-', 'Congo', '.'] - else: - raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) + match lang: + case 'afr': + assert tokens == ['Afrikaans', 'is', 'tipologies', 'beskou', "'", 'n', 'Indo', '-', 'Europese', ',', 'Wes', '-', 'Germaanse', ',', 'Nederfrankiese', 'taal,[2', ']', 'wat', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', 'ontstaan', 'het', '.'] + case 'sqi': + assert tokens == ['Gjuha', 'shqipe', '(', 'ose', 'thjesht', 'shqipja', ')', 'është', 'gjuhë', 'dhe', 'degë', 'e', 'veçantë', 'e', 'familjes', 'indo', '-', 'evropiane', 'që', 'flitet', 'nga', 'rreth', '7', '-', '10', 'milionë', 'njerëz', 'në', 'botë,[1', ']', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'dhe', 'Maqedoninë', 'e', 'Veriut', ',', 'por', 'edhe', 'në', 'zona', 'të', 'tjera', 'të', 'Evropës', 'Juglindore', 'ku', 'ka', 'një', 'popullsi', 'shqiptare', ',', 'duke', 'përfshirë', 'Malin', 'e', 'Zi', 'dhe', 'Luginën', 'e', 'Preshevës', '.'] + case 'amh': + assert tokens == ['አማርኛ[1', ']', '፡', 'የኢትዮጵያ', '፡', 'መደበኛ', '፡', 'ቋንቋ', '፡', 'ነው', '።'] + case 'ara': + assert tokens == ['ٱللُّغَةُ', 'ٱلْعَرَبِيَّة', 'هي', 'أكثر', 'اللغات', 'السامية', 'تحدثًا', '،', 'وإحدى', 'أكثر', 'اللغات', 'انتشاراً', 'في', 'العالم', '،', 'يتحدثها', 'أكثر', 'من', '467', 'مليون', 'نسمة.(1', ')'] + case 'hye': + assert tokens == ['Հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն։', 'Գրաբարով', 'է', 'ավանդված', 'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 'աստվածաբանական-դավանաբանական', 'գրականությունը։'] + case 'asm': + assert tokens == ['অসমীয়া', 'ভাষা', 'হৈছে', 'সকলোতকৈ', 'পূৰ্বীয়', 'ভাৰতীয়-আৰ্য', 'ভাষা', '।'] + case 'aze': + assert tokens == ['Azərbaycan', 'dili[2][3', ']', '(', 'Cənubi', 'Azərbaycanda', ':', 'Türk', 'dili[4][5', ']', ')', '—', 'Azərbaycan', 
'Respublikasının', 'və', 'Rusiya', 'Federasiyası', 'Dağıstan', 'Respublikasının[6', ']', 'rəsmi', 'dövlət', 'dili', '.'] + case 'eus': + assert tokens == ['Euskara', 'Euskal', 'Herriko', 'hizkuntza', 'da.[8', ']'] + case 'ben': + match word_tokenizer: + case 'sacremoses_moses': + assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামেও', 'পরিচিত', ')', 'একটি', 'ইন্দো-আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।'] + case 'spacy_ben': + assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।'] + case _: + tests_lang_util_skipped = True + case 'bul': + assert tokens == ['Бъ̀лгарският', 'езѝк', 'е', 'индоевропейски', 'език', 'от', 'групата', 'на', 'южнославянските', 'езици', ',', 'като', 'образува', 'неговата', 'източна', 'подгрупа', '.'] + case 'cat': + assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'del', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', 'd', "'", 'algunes', 'comarques', 'i', 'localitats', 'de', 'l', "'", 'interior', ')', ',', 'les', 'Illes', 'Balears', '(', 'on', 'també', 'rep', 'el', 'nom', 'de', 'mallorquí', ',', 'menorquí', ',', 'eivissenc', 'o', 'formenterer', 'segons', 'l', "'", 'illa', ')', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', 'l', "'", 'Aragó', ')', ',', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', '(', 'a', 'l', "'", 'illa', 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[', '8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'habitat', 'per', 'pobladors', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', 'l', "'", 'Argentina', ',', 'amb', '200.000', 'parlants', ')', '.', '[', '11', ']'] + case 'zho_cn': + match word_tokenizer: + case 'pkuseg_zho': + assert tokens == ['汉语', '又', '称', '中文', '、', '华语', '[', '6', ']', '、', '唐', '话[', '7]', ',', '概指', '由', '上古', '汉语', '(', '先秦', '雅言', ')', '发展', '而', '来', '、', '书面', '使用', '汉字', '的', '分析语', ',', '为', '汉藏', '语系', '最', '大', '的', '一', '支', '语族', '。'] + case 'wordless_zho_char': + assert tokens == ['汉', '语', '又', '称', '中', '文', '、', '华', '语', '[', '6', ']', '、', '唐', '话', '[', '7', ']', ',', '概', '指', '由', '上', '古', '汉', '语', '(', '先', '秦', '雅', '言', ')', '发', '展', '而', '来', '、', '书', '面', '使', '用', '汉', '字', '的', '分', '析', '语', ',', '为', '汉', '藏', '语', '系', '最', '大', '的', '一', '支', '语', '族', '。'] + case _: + tests_lang_util_skipped = True + case 'zho_tw': + match word_tokenizer: + case 'pkuseg_zho': + assert tokens == ['漢語', '又', '稱', '中文', '、', '華', '語[', '6', ']', '、', '唐', '話[', '7]', ',', '概指', '由', '上古', '漢語', '(', '先秦', '雅言', ')', '發展', '而', '來', '、', '書面', '使用', '漢字', '的', '分析', '語', ',', '為漢', '藏語系', '最', '大', '的', '一', '支', '語族', '。'] + case 'wordless_zho_char': + assert tokens == ['漢', '語', '又', '稱', '中', '文', '、', '華', '語', '[', '6', ']', '、', '唐', '話', '[', '7', ']', ',', '概', '指', '由', '上', 
'古', '漢', '語', '(', '先', '秦', '雅', '言', ')', '發', '展', '而', '來', '、', '書', '面', '使', '用', '漢', '字', '的', '分', '析', '語', ',', '為', '漢', '藏', '語', '系', '最', '大', '的', '一', '支', '語', '族', '。'] + case _: + tests_lang_util_skipped = True + case 'hrv': + assert tokens == ['Hrvatski', 'jezik', '(', 'ISO', '639', '-', '3', ':', 'hrv', ')', 'skupni', 'je', 'naziv', 'za', 'nacionalni', 'standardni', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječja', 'i', 'govora', 'kojima', 'govore', 'ili', 'su', 'nekada', 'govorili', 'Hrvati', '.'] + case 'ces': + assert tokens == ['Čeština', 'neboli', 'český', 'jazyk', 'je', 'západoslovanský', 'jazyk', ',', 'nejbližší', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.'] + case 'dan': + assert tokens == ['Dansk', 'er', 'et', 'østnordisk', 'sprog', 'indenfor', 'den', 'germanske', 'gren', 'af', 'den', 'indoeuropæiske', 'sprogfamilie', '.'] + case 'nld': + assert tokens == ['Het', 'Nederlands', 'is', 'een', 'West-Germaanse', 'taal', ',', 'de', 'meest', 'gebruikte', 'taal', 'in', 'Nederland', 'en', 'België', ',', 'de', 'officiële', 'taal', 'van', 'Suriname', 'en', 'een', 'van', 'de', 'drie', 'officiële', 'talen', 'van', 'België', '.'] + case 'eng_gb' | 'eng_us' | 'other': + if word_tokenizer in ['nltk_nist', 'nltk_regex']: + assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo', '-', 'European', 'language', 'family', '.'] + elif word_tokenizer in ['nltk_nltk', 'nltk_penn_treebank', 'nltk_tok_tok', 'nltk_twitter', 'sacremoses_moses']: + assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] + else: + tests_lang_util_skipped = True + case 'est': + assert tokens == ['Eesti', 'keelel', 'on', 'kaks', 'suuremat', 'murderühma', '(', 'põhjaeesti', 'ja', 'lõunaeesti', ')', ',', 'mõnes', 'käsitluses', 'eristatakse', 'ka', 'kirderanniku', 'murdeid', 'eraldi', 'murderühmana', '.'] + case 'fin': + assert tokens == ['Suomen', 'kieli', 'eli', 'suomi', 'on', 'uralilaisten', 'kielten', 'itämerensuomalaiseen', 'ryhmään', 'kuuluva', 'kieli', ',', 'jota', 'puhuvat', 'pääosin', 'suomalaiset', '.'] + case 'fra': + assert tokens == ['Le', 'français', 'est', 'une', 'langue', 'indo-européenne', 'de', 'la', 'famille', 'des', 'langues', 'romanes', 'dont', 'les', 'locuteurs', 'sont', 'appelés', 'francophones', '.'] + case 'lug': + assert tokens == ['Luganda', '/', 'Oluganda', 'lwe', 'lulimi', 'olwogerwa', 'Abaganda', 'e', 'Yuganda', '.'] + case 'deu_at' | 'deu_de' | 'deu_ch': + assert tokens == ['Das', 'Deutsche', 'ist', 'eine', 'plurizentrische', 'Sprache', ',', 'enthält', 'also', 'mehrere', 'Standardvarietäten', 'in', 'verschiedenen', 'Regionen', '.'] + case 'grc': + assert tokens == ['ἦλθον', 'δὲ', 'οἱ', 'δύο', 'ἄγγελοι', 'εἰς', 'Σόδομα', 'ἑσπέρας', '·', 'Λὼτ', 'δὲ', 'ἐκάθητο', 'παρὰ', 'τὴν', 'πύλην', 'Σοδόμων', '.', 'ἰδὼν', 'δὲ', 'Λὼτ', 'ἐξανέστη', 'εἰς', 'συνάντησιν', 'αὐτοῖς', 'καὶ', 'προσεκύνησεν', 'τῷ', 'προσώπῳ', 'ἐπὶ', 'τὴν', 'γῆν'] + case 'ell': + assert tokens == ['Η', 'ελληνική', 'γλώσσα', 'ανήκει', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια', '[', '9', ']', 'και', 'αποτελεί', 'το', 'μοναδικό', 'μέλος', 'του', 'ελληνικού', 'κλάδου', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδας', 'και', 'της', 'Κύπρου', '.'] + case 'guj': + match word_tokenizer: + case 'sacremoses_moses': + assert tokens == ['ગુજરાતી', '\u200d', '(', '/', 'ɡʊdʒəˈrɑːti', '/', '[', '૭', ']', ',', 'રોમન', 'લિપિમાં', ':', 'Gujarātī', ',', 'ઉચ્ચાર', ':', '[', 'ɡudʒəˈɾɑːtiː', 
']', ')', 'ભારત', 'દેશના', 'ગુજરાત', 'રાજ્યની', 'ઇન્ડો-આર્યન', 'ભાષા', 'છે', ',', 'અને', 'મુખ્યત્વે', 'ગુજરાતી', 'લોકો', 'દ્વારા', 'બોલાય', 'છે', '.'] + case 'spacy_guj': + assert tokens == ['ગુજરાતી', '\u200d(/ɡʊdʒəˈrɑːti/[૭', ']', ',', 'રોમન', 'લિપિમાં', ':', 'Gujarātī', ',', 'ઉચ્ચાર', ':', '[', 'ɡudʒəˈɾɑːtiː', ']', ')', 'ભારત', 'દેશના', 'ગુજરાત', 'રાજ્યની', 'ઇન્ડો-આર્યન', 'ભાષા', 'છે', ',', 'અને', 'મુખ્યત્વે', 'ગુજરાતી', 'લોકો', 'દ્વારા', 'બોલાય', 'છે.'] + case _: + tests_lang_util_skipped = True + case 'heb': + assert tokens == ['עִבְרִית', 'היא', 'שפה', 'שמית', ',', 'ממשפחת', 'השפות', 'האפרו', '-', 'אסיאתיות', ',', 'הידועה', 'כשפתם', 'של', 'היהודים', 'ושל', 'השומרונים', '.'] + case 'hin': + assert tokens == ['हिन्दी', 'जिसके', 'मानकीकृत', 'रूप', 'को', 'मानक', 'हिन्दी', 'कहा', 'जाता', 'है', ',', 'विश्व', 'की', 'एक', 'प्रमुख', 'भाषा', 'है', 'और', 'भारत', 'की', 'एक', 'राजभाषा', 'है', '।'] + case 'hun': + assert tokens == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tagja', ',', 'a', 'finnugor', 'nyelvek', 'közé', 'tartozó', 'ugor', 'nyelvek', 'egyike', '.'] + case 'isl': + match word_tokenizer: + case 'sacremoses_moses': + assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga', '.', '[', '5', ']'] + case 'spacy_isl': + assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga.[5', ']'] + case _: + tests_lang_util_skipped = True + case 'ind': + assert tokens == ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'nasional', 'dan', 'resmi', 'di', 'seluruh', 'wilayah', 'Indonesia', '.'] + case 'gle': + assert tokens == ['Is', 'ceann', 'de', 'na', 'teangacha', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'de', 'na', 'trí', 'cinn', 'de', 'theangacha', 'Ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'Gaelacha', '(', 'Gaeilge', ',', 'Gaeilge', 'Mhanann', 'agus', 'Gaeilge', 'na', 'hAlban', ')', 'go', 'háirithe', '.'] + case 'ita': + assert tokens == ["L'", 'italiano', '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascoltaⓘ', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.'] + case 'jpn': + match word_tokenizer: + case 'sudachipy_jpn_split_mode_a': + assert tokens == ['日本', '語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注釈', '2', ']', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だっ', 'た', '国', '、', 'そして', '国外', '移民', 'や', '移住', '者', 'を', '含む', '日本', '人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', '。'] + case 'sudachipy_jpn_split_mode_b' | 'sudachipy_jpn_split_mode_c': + assert tokens == ['日本語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注釈', '2', ']', ')', 'は', '、', '日本', '国', '内', 'や', '、', 'かつて', 'の', '日本', '領', 'だっ', 'た', '国', '、', 'そして', '国外', '移民', 'や', '移住者', 'を', '含む', '日本人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', '。'] + case 'wordless_jpn_kanji': + assert tokens == ['日', '本', '語', '(', 'にほん', 'ご', '、', 'にっぽん', 'ご', '[', '注', '釈', '2', ']', ')', 'は', '、', '日', '本', '国', '内', 'や', '、', 'かつて', 'の', '日', '本', '領', 'だっ', 'た', '国', '、', 'そして', '国', '外', '移', '民', 'や', '移', '住', '者', 'を', '含', 'む', '日', '本', '人', '同', '士', 'の', '間', 'で', '使', '用', 'さ', 'れ', 'て', 'いる', '言', '語', 
'。'] + case _: + tests_lang_util_skipped = True + case 'kan': + assert tokens == ['ದ್ರಾವಿಡ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಪ್ರಾಮುಖ್ಯವುಳ್ಳ', 'ಭಾಷೆಯೂ', 'ಭಾರತದ', 'ಪುರಾತನವಾದ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಒಂದೂ', 'ಆಗಿರುವ', 'ಕನ್ನಡ', 'ಭಾಷೆಯನ್ನು', 'ಅದರ', 'ವಿವಿಧ', 'ರೂಪಗಳಲ್ಲಿ', 'ಸುಮಾರು', '೪೫', 'ದಶಲಕ್ಷ', 'ಜನರು', 'ಆಡು', 'ನುಡಿಯಾಗಿ', 'ಬಳಸುತ್ತಲಿದ್ದಾರೆ', '.'] + case 'khm': + assert tokens == ['ភាសា', 'ខ្មែរ', 'គឺជា', 'ភាសា', 'កំណើត', 'របស់', 'ជនជាតិ', 'ខ្មែរ', 'និង', 'ជា', 'ភាសា', 'ផ្លូវការ', 'របស់', 'ប្រទេស', 'កម្ពុជា', '។'] + case 'kor': + assert tokens == ['세계', '여러', '지역', '에', '한', '민족', '인구', '가', '거주', '하', '게', '되', '면서', '전', '세계', '각지', '에서', '한국어', '가', '사용', '되', '고', '있', '다', '.'] + case 'kir': + assert tokens == ['Кыргыз', 'тили', '—', 'Кыргыз', 'Республикасынын', 'мамлекеттик', 'тили', ',', 'түрк', 'тилдеринин', 'курамына', ',', 'анын', 'ичинде', 'кыргыз-кыпчак', 'же', 'тоо-алтай', 'тобуна', 'кирет', '.'] + case 'lao': + assert tokens == ['ພາສາລາວ', '(', 'Lao', ':', 'ລາວ', ',', '[', 'l', 'áː', 'w', ']', 'ຫຼື', 'ພາສາລາວ', ',', '[', 'p', 'ʰáː', 's', 'ǎː', 'l', 'áː', 'w', '])', 'ເປັນ', 'ພາສາ', 'ຕະກູນ', 'ໄທ', '-', 'ກະໄດ', 'ຂອງ', 'ຄົນ', 'ລາວ', 'ໂດຍ', 'ມີ', 'ຄົນ', 'ເວົ້າ', 'ໃນປະເທດລາວ', 'ເຊິ່ງ', 'ເປັນ', 'ພາສາ', 'ລັດຖະການ', 'ຂອງ', 'ສາທາລະນະລັດ', 'ປະຊາທິປະໄຕ', 'ປະຊາຊົນ', 'ລາວ', 'ຂອງ', 'ປະຊາກອນ', 'ປະມານ', '7', 'ລ້ານ', 'ຄົນ', 'ແລະ', 'ໃນ', 'ພື້ນທີ່', 'ພາກ', 'ຕາເວັນອອກສຽງ', 'ເໜືອ', 'ຂອງ', 'ປະເທດໄທ', 'ທີ່ມີ', 'ຄົນ', 'ເວົ້າ', 'ປະມານ', '23', 'ລ້ານ', 'ຄົນ', 'ທາງ', 'ລັດຖະບານ', 'ປະເທດໄທ', 'ມີການສະໜັບສະໜຸນ', 'ໃຫ້', 'ເອີ້ນ', 'ພາສາລາວ', 'ຖິ່ນ', 'ໄທ', 'ວ່າ', 'ພາສາລາວ', 'ຖິ່ນ', 'ອີສານ', 'ນອກຈາກ', 'ນີ້', ',', 'ຢູ່', 'ທາງ', 'ພາກ', 'ຕາເວັນອອກສຽງ', 'ເໜືອ', 'ຂອງ', 'ປະເທດກຳປູເຈຍ', 'ກໍ', 'ມີ', 'ຄົນ', 'ເວົ້າ', 'ພາສາລາວ', 'ຄືກັນ', '.'] + case 'lat': + assert tokens == ['Lingua', 'Latina,[1', ']', 'sive', 'sermo', 'Latinus,[2', ']', 'est', 'lingua', 'Indoeuropaea', 'qua', 'primum', 'Latini', 'universi', 'et', 'Romani', 'antiqui', 'in', 'primis', 'loquebantur', 'quamobrem', 'interdum', 'etiam', 'lingua', 'Latia[3', ']', '(', 'in', 'Latio', 'enim', 'sueta', ')', 'et', 'lingua', 'Romana[4', ']', '(', 'nam', 'imperii', 'Romani', 'sermo', 'sollemnis', ')', 'appellatur', '.'] + case 'lav': + match word_tokenizer: + case 'sacremoses_moses': + assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,5', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda', '.', '[', '1', ']', '[', '3', ']'] + case 'spacy_lav': + assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,5', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda.[1][3', ']'] + case _: + tests_lang_util_skipped = True + case 'lij': + assert tokens == ['O', 'baxin', "d'", 'influensa', 'di', 'dialetti', 'lìguri', 'o', "l'", 'é', 'de', 'çirca', '2', 'milioìn', 'de', 'personn', '-', 'e', 'anche', 'se', ',', 'specialmente', 'inti', 'ùrtimi', "çinquant'", 'anni', ',', 'pe', 'coscì', 'de', 'variante', 'locali', 'se', 'son', 'pèrse', 'e', 'de', 'âtre', 'son', 'a', 'reizego', "tutt'", 'òua', ',', 'anche', 'pe', 'córpa', 'da', 'mancansa', 'de', "'", 'n', 'pâ', 'de', 'generaçioin', 'inta', 'continoasion', 'da', 'parlâ', '.'] + case 'lit': + assert tokens == ['Lietuvių', 'kalba', '–', 'iš', 'baltų', 'prokalbės', 'kilusi', 'lietuvių', 'tautos', 'kalba', ',', 'kuri', 'Lietuvoje', 'yra', 'valstybinė', ',', 'o', 'Europos', 'Sąjungoje', '–', 'viena', 'iš', 'oficialiųjų', 'kalbų', '.'] + case 'ltz': + assert tokens == ["D'", 'Lëtzebuergesch', 'gëtt', 'an', 
'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.'] + case 'mkd': + assert tokens == ['Македонски', 'јазик', '—', 'јужнословенски', 'јазик', ',', 'дел', 'од', 'групата', 'на', 'словенски', 'јазици', 'од', 'јазичното', 'семејство', 'на', 'индоевропски', 'јазици', '.'] + case 'msa': + assert tokens == ['Bahasa', 'Melayu', '(', 'Tulisan', 'Jawi', ':', 'بهاس', 'ملايو', ';', 'Rencong', ':', 'ꤷꥁꤼ', 'ꤸꥍꤾꤿꥈ', ')', 'ialah', 'salah', 'satu', 'daripada', 'bahasa', '-', 'bahasa', 'Melayu', '-', 'Polinesia', 'di', 'bawah', 'keluarga', 'bahasa', 'Austronesia', ',', 'yang', 'merupakan', 'bahasa', 'rasmi', 'di', 'Brunei', ',', 'Indonesia', ',', 'Malaysia', 'dan', 'Singapura', ',', 'serta', 'dituturkan', 'di', 'Timor', 'Leste', 'dan', 'sebahagian', 'wilayah', 'di', 'Kemboja', ',', 'Filipina', 'dan', 'Thailand', '.'] + case 'mal': + match word_tokenizer: + case 'sacremoses_moses': + assert tokens == ['ഇന്ത്യയിൽ', 'കേരള', 'സംസ്ഥാനത്തിലും', 'കേന്ദ്രഭരണപ്രദേശങ്ങളായ', 'ലക്ഷദ്വീപിലും', 'പോണ്ടിച്ചേരിയുടെ', 'ഭാഗമായ', 'മാഹിയിലും', 'തമിഴ്നാട്ടിലെ', 'കന്യാകുമാരി', 'ജില്ലയിലും', 'നീലഗിരി', 'ജില്ലയിലെ', 'ഗൂഡല്ലൂർ', 'താലൂക്കിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം', '.'] + case 'spacy_mal': + assert tokens == ['ഇന്ത്യയിൽ', 'കേരള', 'സംസ്ഥാനത്തിലും', 'കേന്ദ്രഭരണപ്രദേശങ്ങളായ', 'ലക്ഷദ്വീപിലും', 'പോണ്ടിച്ചേരിയുടെ', 'ഭാഗമായ', 'മാഹിയിലും', 'തമിഴ്നാട്ടിലെ', 'കന്യാകുമാരി', 'ജില്ലയിലും', 'നീലഗിരി', 'ജില്ലയിലെ', 'ഗൂഡല്ലൂർ', 'താലൂക്കിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം.'] + case _: + tests_lang_util_skipped = True + case 'mar': + match word_tokenizer: + case 'sacremoses_moses': + assert tokens == ['मराठी', 'भाषा', 'ही', 'इंडो-युरोपीय', 'भाषाकुळातील', 'एक', 'भाषा', 'आहे', '.'] + case 'spacy_mar': + assert tokens == ['मराठी', 'भाषा', 'ही', 'इंडो', '-', 'युरोपीय', 'भाषाकुळातील', 'एक', 'भाषा', 'आहे', '.'] + case _: + tests_lang_util_skipped = True + case 'mni_mtei': + assert tokens == ['ꯃꯤꯇꯩꯂꯣꯟ', '(', 'ꯃꯤꯇꯩꯂꯣꯜ', ')', 'ꯅꯠꯇ', '꯭', 'ꯔꯒ', 'ꯃꯩꯇꯩꯂꯣꯟ', '(', 'ꯃꯩꯇꯩꯂꯣꯜ', ')', 'ꯅꯠꯇ', '꯭', 'ꯔꯒ', 'ꯃꯅꯤꯄꯨꯔꯤ', 'ꯂꯣꯟ', '(', 'ꯃꯅꯤꯄꯨꯔꯤ', 'ꯂꯣꯜ', ')', 'ꯑꯁꯤ', 'ꯑꯋꯥꯡ-ꯅꯣꯡꯄꯣꯛ', 'ꯏꯟꯗꯤꯌꯥꯒꯤ', 'ꯃꯅꯤꯄꯨꯔꯗ', 'ꯃꯄꯨꯡ', 'ꯑꯣꯢꯅ', 'ꯉꯥꯡꯅꯕ', 'ꯂꯣꯟ', 'ꯑꯃꯅꯤ', '꯫'] + case 'nep': + assert tokens == ['नेपाली', 'भाषा', '(', 'अन्तर्राष्ट्रिय', 'ध्वन्यात्मक', 'वर्णमाला', '[', 'neˈpali', 'bʱaʂa', ']', ')', 'नेपालको', 'सम्पर्क', 'भाषा', 'तथा', 'भारत', ',', 'भुटान', 'र', 'म्यानमारको', 'केही', 'भागमा', 'मातृभाषाको', 'रूपमा', 'बोलिने', 'भाषा', 'हो', '।'] + case 'nob': + assert tokens == ['Bokmål', 'er', 'en', 'varietet', 'av', 'norsk', 'skriftspråk', '.'] + case 'ori': + assert tokens == ['ଓଡ଼ିଆ', '(', 'ଇଂରାଜୀ', 'ଭାଷାରେ', 'Odia', '/', 'əˈdiːə', '/', 'or', 'Oriya', '/', 'ɒˈriːə', '/', ',', ')', 'ଏକ', 'ଭାରତୀୟ', 'ଭାଷା', 'ଯାହା', 'ଏକ', 'ଇଣ୍ଡୋ-ଇଉରୋପୀୟ', 'ଭାଷାଗୋଷ୍ଠୀ', 'ଅନ୍ତର୍ଗତ', 'ଇଣ୍ଡୋ-ଆର୍ଯ୍ୟ', 'ଭାଷା', '।'] + case 'fas': + assert tokens == ['فارسی', 'یا', 'پارسی', 'یک', 'زبان', 'ایرانی', 'غربی', 'از', 'زیرگروه', 'ایرانی', 'شاخهٔ', 'هندوایرانیِ', 'خانوادهٔ', 'زبان\u200cهای', 'هندواروپایی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', 'تاجیکستان', '،', 'ازبکستان', '،', 'پاکستان', '،', 'عراق', '،', 'ترکمنستان', 'و', 'آذربایجان', 'به', 'آن', 'سخن', 'می\u200cگویند', '.'] + case 'pol': + assert tokens == ['Język', 'polski', ',', 'polszczyzna', '–', 'język', 'z', 'grupy', 'zachodniosłowiańskiej', '(', 'do', 'której', 'należą', 'również', 'czeski', ',', 'kaszubski', ',', 'słowacki', 'i', 'języki', 'łużyckie', ')', ',', 'stanowiącej', 'część', 
'rodziny', 'indoeuropejskiej', '.'] + case 'por_br' | 'por_pt': + assert tokens == ['A', 'língua', 'portuguesa', ',', 'também', 'designada', 'português', ',', 'é', 'uma', 'língua', 'indo-europeia', 'românica', 'flexiva', 'ocidental', 'originada', 'no', 'galego-português', 'falado', 'no', 'Reino', 'da', 'Galiza', 'e', 'no', 'norte', 'de', 'Portugal', '.'] + case 'pan_guru': + assert tokens == ['ਪੰਜਾਬੀ', 'ਭਾਸ਼ਾ', '(', 'ਸ਼ਾਹਮੁਖੀ', ':', '\u200e', 'پنجابی', ',', 'ਪੰਜਾਬੀ', ')', 'ਪੰਜਾਬ', 'ਦੀ', 'ਭਾਸ਼ਾ', 'ਹੈ', ',', 'ਜਿਸ', 'ਨੂੰ', 'ਪੰਜਾਬ', 'ਖੇਤਰ', 'ਦੇ', 'ਵਸਨੀਕ', 'ਜਾਂ', 'ਸੰਬੰਧਿਤ', 'ਲੋਕ', 'ਬੋਲਦੇ', 'ਹਨ', '।', '[', '18', ']'] + case 'ron': + assert tokens == ['Limba', 'română', 'este', 'o', 'limbă', 'indo-europeană', 'din', 'grupul', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbilor', 'romanice', '.'] + case 'rus': + match word_tokenizer: + case 'nltk_tok_tok': + assert tokens == ['Ру́сский', 'язы́к', '(', 'МФА', ':', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянской', 'группы', 'славянской', 'ветви', 'индоевропейской', 'языковой', 'семьи', ',', 'национальный', 'язык', 'русского', 'народа', '.'] + case 'sacremoses_moses': + assert tokens == ['Ру', '́', 'сский', 'язы', '́', 'к', '(', 'МФА', ':', '[', 'ˈruskʲɪi', '̯', 'jɪˈzɨk', ']', 'ⓘ', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'язык', 'восточнославянской', 'группы', 'славянской', 'ветви', 'индоевропейской', 'языковой', 'семьи', ',', 'национальный', 'язык', 'русского', 'народа', '.'] + case _: + tests_lang_util_skipped = True + case 'san': + assert tokens == ['संस्कृतम्', 'जगतः', 'एकतमा', 'अतिप्राचीना', 'समृद्धा', 'शास्त्रीया', 'च', 'भाषासु', 'वर्तते', '।'] + case 'srp_cyrl': + assert tokens == ['Српски', 'језик', 'је', 'званичан', 'у', 'Србији', ',', 'Босни', 'и', 'Херцеговини', 'и', 'Црној', 'Гори', 'и', 'говори', 'га', 'око', '12', 'милиона', 'људи.[13', ']'] + case 'srp_latn': + assert tokens == ['Srpski', 'jezik', 'je', 'zvaničan', 'u', 'Srbiji', ',', 'Bosni', 'i', 'Hercegovini', 'i', 'Crnoj', 'Gori', 'i', 'govori', 'ga', 'oko', '12', 'miliona', 'ljudi.[13', ']'] + case 'sin': + assert tokens == ['ශ්\u200dරී', 'ලංකාවේ', 'ප්\u200dරධාන', 'ජාතිය', 'වන', 'සිංහල', 'ජනයාගේ', 'මව්', 'බස', 'සිංහල', 'වෙයි', '.'] + case 'slk': + assert tokens == ['Slovenčina', 'je', 'oficiálne', 'úradným', 'jazykom', 'Slovenska', ',', 'Vojvodiny', 'a', 'od', '1', '.', 'mája', '2004', 'jedným', 'z', 'jazykov', 'Európskej', 'únie', '.'] + case 'slv': + assert tokens == ['Slovenščina', '[', 'sloˈʋenʃtʃina', ']', 'je', 'združeni', 'naziv', 'za', 'uradni', 'knjižni', 'jezik', 'Slovencev', 'in', 'skupno', 'ime', 'za', 'narečja', 'in', 'govore', ',', 'ki', 'jih', 'govorijo', 'ali', 'so', 'jih', 'nekoč', 'govorili', 'Slovenci', '.'] + case 'dsb': + assert tokens == ['Dolnoserbšćina', ',', 'dolnoserbska', 'rěc', '(', 'nimski', 'Niedersorbisch', 'abo', 'teke', 'Wendisch', ',', 'pólski', 'język', 'dolnołużycki', ',', 'česki', 'dolnolužická', 'srbština', ')', 'jo', 'jadna', 'z', 'dweju', 'rěcowu', 'Serbow', ',', 'kotaraž', 'se', 'wužywa', 'w', 'Dolnej', 'Łužycy', ',', 'w', 'pódpołdnjowej', 'Bramborskej', ',', 'na', 'pódzajtšu', 'Nimskej', '.'] + case 'hsb': + assert tokens == ['Hornjoserbšćina', 'je', 'zapadosłowjanska', 'rěč', ',', 'kotraž', 'so', 'w', 'Hornjej', 'Łužicy', 'wokoło', 'městow', 'Budyšin', ',', 'Kamjenc', 'a', 'Wojerecy', 'rěči', '.'] + case 'spa': + assert tokens == ['El', 'español', 'o', 'castellano', 'es', 'una', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablado', ',', 
'perteneciente', 'a', 'la', 'familia', 'de', 'lenguas', 'indoeuropeas', '.'] + case 'swe': + assert tokens == ['Svenska', '(', 'svenska', '(', 'info', ')', ')', 'är', 'ett', 'östnordiskt', 'språk', 'som', 'talas', 'av', 'ungefär', 'tio', 'miljoner', 'personer', 'främst', 'i', 'Sverige', 'där', 'språket', 'har', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'det', 'ena', 'nationalspråket', 'i', 'Finland', 'och', 'som', 'enda', 'officiella', 'språk', 'på', 'Åland', '.'] + case 'tgl': + assert tokens == ['Ang', 'wikang', 'Tagalog[1', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜆᜄᜎᜓ', ')', ',', 'o', 'ang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pinakaginagamit', 'na', 'wika', 'ng', 'Pilipinas', '.'] + case 'tgk': + assert tokens == ['Забони', 'тоҷикӣ', '—', 'забоне', ',', 'ки', 'дар', 'Эрон', ':', 'форсӣ', ',', 'ва', 'дар', 'Афғонистон', 'дарӣ', 'номида', 'мешавад', ',', 'забони', 'давлатии', 'кишварҳои', 'Тоҷикистон', ',', 'Эрон', 'ва', 'Афғонистон', 'мебошад', '.'] + case 'tam': + assert tokens == ['தமிழ்', '(', 'Tamil', 'language', ')', 'தமிழர்களினதும்', 'தமிழ்', 'பேசும்', 'பலரின்', 'தாய்மொழி', 'ஆகும்', '.'] + case 'tat': + assert tokens == ['Татар', 'теле', '—', 'татарларның', 'милли', 'теле', ',', 'Татарстанның', 'дәүләт', 'теле', ',', 'таралышы', 'буенча', 'Россиядә', 'икенче', 'тел', '.'] + case 'tel': + assert tokens == ['తెలుగు', 'అనేది', 'ద్రావిడ', 'భాషల', 'కుటుంబానికి', 'చెందిన', 'భాష', '.'] + case 'tdt': + assert tokens == ['Tetun', '(', 'iha', 'portugés', ':', 'tétum', ';', 'iha', 'inglés', ':', 'Tetum', ')', 'ne', "'", 'e', 'lian', 'nasionál', 'no', 'ko-ofisiál', 'Timór', 'Lorosa', "'", 'e', 'nian', '.'] + case 'tha': + match word_tokenizer: + case 'pythainlp_longest_matching' | 'pythainlp_max_matching_tcc': + assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย', '[', '3', '][', '4', ']'] + case 'pythainlp_max_matching': + assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทยกลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย', '[', '3', '][', '4', ']'] + case 'pythainlp_nercut': + assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษา', 'ใน', 'กลุ่ม', 'ภาษา', 'ไท', 'ซึ่ง', 'เป็น', 'กลุ่มย่อย', 'ของ', 'ตระกูล', 'ภาษา', 'ข', 'ร้า', '-', 'ไท', 'และ', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย', '[', '3', '][', '4', ']'] + case _: + tests_lang_util_skipped = True + case 'bod': + assert tokens == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'བལ་ཡུལ', '།', 'འབྲུག་', 'དང་', 'འབྲས་ལྗོངས', '།'] + case 'tir': + assert tokens == ['ትግርኛ', 'ኣብ', 'ኤርትራን', 'ኣብ', 'ሰሜናዊ', 'ኢትዮጵያን', 'ኣብ', 'ክልል', 'ትግራይ', 'ዝዝረብ', 'ሴማዊ', 'ቋንቋ', 'እዩ', '።'] + case 'tsn': + assert tokens == ['Setswana', 'ke', 'teme', 'e', 'e', 'buiwang', 'mo', 'mafatsheng', 'a', 'Aforika', 'Borwa', ',', 'Botswana', ',', 'Namibia', 'le', 'Zimbabwe', '.'] + case 'tur': + assert tokens == ['Türkçe', 'ya', 'da', 'Türk', 'dili', ',', 'Güneydoğu', 'Avrupa', 've', 'Batı', "Asya'da", 'konuşulan', ',', 'Türk', 'dilleri', 'dil', 'ailesine', 'ait', 'sondan', 'eklemeli', 'bir', 'dil.[12', ']'] + case 'ukr': + assert tokens == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', 
',', 'історичні', 'назви', '—', 'ру́ська', ',', 'руси́нська[10][11][12', ']', '[', '*', '1', ']', ')', '—', 'національна', 'мова', 'українців', '.'] + case 'urd': + assert tokens == ['اُردُو[8', ']', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے', '۔'] + case 'vie': + match word_tokenizer: + case 'nltk_tok_tok': + assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi', 'là', 'tiếng', 'Việt', 'Nam[', '9', ']', 'hay', 'Việt', 'ngữ', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.'] + case 'underthesea_vie': + assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi là', 'tiếng', 'Việt Nam', '[', '9', ']', 'hay', 'Việt ngữ', 'là', 'ngôn ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn ngữ', 'chính thức', 'tại', 'Việt Nam', '.'] + case _: + tests_lang_util_skipped = True + case 'yor': + assert tokens == ['Èdè', 'Yorùbá', 'Ni', 'èdè', 'tí', 'ó', 'ṣàkójọ', 'pọ̀', 'gbogbo', 'kú', 'oótu', 'o', '-', 'ò', '-', 'jíire', 'bí', ',', 'níapá', 'ìwọ̀', 'Oòrùn', 'ilẹ̀', 'Nàìjíríà', ',', 'tí', 'a', 'bá', 'wo', 'èdè', 'Yorùbá', ',', 'àwọn', 'onímọ̀', 'pín', 'èdè', 'náà', 'sábẹ́', 'ẹ̀yà', 'Kwa', 'nínú', 'ẹbí', 'èdè', 'Niger', '-', 'Congo', '.'] + case _: + raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) if tests_lang_util_skipped: raise wl_test_init.Wl_Exception_Tests_Lang_Util_Skipped(word_tokenizer) @@ -334,10 +342,11 @@ def test_char_tokenizers(): ('zho_cn', 'jpn'), ('wordless_zho_char', 'wordless_jpn_kanji') ): - if lang == 'zho_cn': - sentence = '英国全称是United Kingdom of Great Britain,由四个部分组成:England、Scotland、Wales和Northern Ireland' - elif lang == 'jpn': - sentence = '''The meaning of "天気がいいから、散歩しましょう。" is: The weather is good so let's take a walk.''' + match lang: + case 'zho_cn': + sentence = '英国全称是United Kingdom of Great Britain,由四个部分组成:England、Scotland、Wales和Northern Ireland' + case 'jpn': + sentence = '''The meaning of "天気がいいから、散歩しましょう。" is: The weather is good so let's take a walk.''' tokens = wl_word_tokenization.wl_word_tokenize_flat( main, @@ -345,14 +354,16 @@ def test_char_tokenizers(): lang = lang, word_tokenizer = char_tokenizer ) + tokens = wl_texts.to_display_texts(tokens) print(f'{lang} / {char_tokenizer}:') print(f'{tokens}\n') - if lang == 'zho_cn': - assert tokens == ['英', '国', '全', '称', '是', 'United', 'Kingdom', 'of', 'Great', 'Britain', ',', '由', '四', '个', '部', '分', '组', '成', ':', 'England', '、', 'Scotland', '、', 'Wales', '和', 'Northern', 'Ireland'] - elif lang == 'jpn': - assert tokens == ['The', 'meaning', 'of', '``', '天', '気', 'が', 'いい', 'から', '、', '散', '歩', 'し', 'ましょう', '。', '``', 'is', ':', 'The', 'weather', 'is', 'good', 'so', 'let', "'s", 'take', 'a', 'walk', '.'] + match lang: + case 'zho_cn': + assert tokens == ['英', '国', '全', '称', '是', 'United', 'Kingdom', 'of', 'Great', 'Britain', ',', '由', '四', '个', '部', '分', '组', '成', ':', 'England', '、', 'Scotland', '、', 'Wales', '和', 'Northern', 'Ireland'] + case 'jpn': + assert tokens == ['The', 'meaning', 'of', '``', '天', '気', 'が', 'いい', 'から', '、', '散', '歩', 'し', 'ましょう', '。', '``', 'is', ':', 'The', 'weather', 'is', 'good', 'so', 'let', "'s", 'take', 'a', 'walk', '.'] if __name__ == '__main__': for lang, word_tokenizer in test_word_tokenizers_local: diff --git a/tests/tests_nlp/tests_spacy/test_spacy.py b/tests/tests_nlp/tests_spacy/test_spacy.py index 05f61c507..1104f7757 100644 --- a/tests/tests_nlp/tests_spacy/test_spacy.py +++ b/tests/tests_nlp/tests_spacy/test_spacy.py @@ -17,10 +17,8 @@ # 
----------------------------------------------------------------------
from tests import wl_test_init, wl_test_lang_examples
-from wordless.wl_nlp import (
-    wl_dependency_parsing, wl_lemmatization, wl_nlp_utils, wl_pos_tagging, wl_sentence_tokenization,
-    wl_word_tokenization
-)
+from tests.tests_nlp import test_dependency_parsing, test_lemmatization, test_pos_tagging
+from wordless.wl_nlp import wl_nlp_utils, wl_sentence_tokenization, wl_texts, wl_word_tokenization
from wordless.wl_utils import wl_conversion
main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'spacy')
@@ -36,20 +34,22 @@ def wl_test_spacy(
    lang_no_suffix = wl_conversion.remove_lang_code_suffixes(main, lang)
    wl_nlp_utils.check_models(main, langs = [lang], lang_utils = [[f'spacy_{lang_no_suffix}']])
+    test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}')
+
    wl_test_sentence_tokenize(lang, results_sentence_tokenize_trf, results_sentence_tokenize_lg)
-    wl_test_word_tokenize(lang, results_word_tokenize)
+    wl_test_word_tokenize(lang, test_sentence, results_word_tokenize)
    # Tokenized
    tokens = wl_word_tokenization.wl_word_tokenize_flat(
        main,
-        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
+        text = test_sentence,
        lang = lang
    )
    if lang != 'other':
-        wl_test_pos_tag(lang, tokens, results_pos_tag, results_pos_tag_universal)
-        wl_test_lemmatize(lang, tokens, results_lemmatize)
-        wl_test_dependency_parse(lang, tokens, results_dependency_parse)
+        wl_test_pos_tag(lang, test_sentence, tokens, results_pos_tag, results_pos_tag_universal)
+        wl_test_lemmatize(lang, test_sentence, tokens, results_lemmatize)
+        wl_test_dependency_parse(lang, test_sentence, tokens, results_dependency_parse)
def wl_test_sentence_tokenize(lang, results_trf, results_lg):
    lang_no_suffix = wl_conversion.remove_lang_code_suffixes(main, lang)
@@ -94,9 +94,8 @@ def wl_test_sentence_tokenize(lang, results_trf, results_lg):
    assert sentences_lg == results_lg
-def wl_test_word_tokenize(lang, results):
+def wl_test_word_tokenize(lang, test_sentence, results):
    lang_no_suffix = wl_conversion.remove_lang_code_suffixes(main, lang)
-    test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}')
    word_tokenizer = f'spacy_{lang_no_suffix}'
    tokens = wl_word_tokenization.wl_word_tokenize_flat(
@@ -114,189 +113,22 @@ def wl_test_word_tokenize(lang, results):
    # The count of tokens should be more than the length of tokens split by space
    assert len(tokens) > len(test_sentence.split())
-    assert tokens == results
+    assert wl_texts.to_display_texts(tokens) == results
-def wl_test_pos_tag(lang, tokens, results, results_universal):
+def wl_test_pos_tag(lang, test_sentence, tokens, results, results_universal):
    lang_no_suffix = wl_conversion.remove_lang_code_suffixes(main, lang)
-    test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}')
    pos_tagger = f'spacy_{lang_no_suffix}'
-    # Untokenized
-    tokens_tagged = wl_pos_tagging.wl_pos_tag(
-        main,
-        inputs = test_sentence,
-        lang = lang,
-        pos_tagger = pos_tagger
-    )
-    tokens_tagged_universal = wl_pos_tagging.wl_pos_tag(
-        main,
-        inputs = test_sentence,
-        lang = lang,
-        pos_tagger = pos_tagger,
-        tagset = 'universal'
-    )
-
-    # Tokenized
-    tokens_tagged_tokenized = wl_pos_tagging.wl_pos_tag(
-        main,
-        inputs = tokens,
-        lang = lang,
-        pos_tagger = pos_tagger
-    )
-    tokens_tagged_universal_tokenized = wl_pos_tagging.wl_pos_tag(
-        main,
-        inputs = tokens,
-        lang = lang,
-        pos_tagger = pos_tagger,
-        tagset = 'universal'
-    )
-
-    print(f'{lang} / {pos_tagger}:')
-
print(tokens_tagged) - print(f'{tokens_tagged_universal}\n') - - # Check for empty tags - assert tokens_tagged == results - assert tokens_tagged_universal == results_universal - assert tokens_tagged_tokenized - assert tokens_tagged_universal_tokenized - assert all((tag for token, tag in tokens_tagged)) - assert all((tag for token, tag in tokens_tagged_universal)) - assert all((tag for token, tag in tokens_tagged_tokenized)) - assert all((tag for token, tag in tokens_tagged_universal_tokenized)) - # Universal tags should not all be "X" - assert any((tag for token, tag in tokens_tagged_universal if tag != 'X')) - assert any((tag for token, tag in tokens_tagged_universal_tokenized if tag != 'X')) - - # Tokenization should not be modified - assert len(tokens) == len(tokens_tagged_tokenized) == len(tokens_tagged_universal_tokenized) - - # Long texts - tokens_tagged_tokenized_long = wl_pos_tagging.wl_pos_tag( - main, - inputs = [str(i) for i in range(101) for j in range(10)], - lang = lang, - pos_tagger = pos_tagger - ) - - assert [token[0] for token in tokens_tagged_tokenized_long] == [str(i) for i in range(101) for j in range(10)] + test_pos_tagging.wl_test_pos_tag_models(lang, pos_tagger, test_sentence, tokens, results, results_universal) -def wl_test_lemmatize(lang, tokens, results): +def wl_test_lemmatize(lang, test_sentence, tokens, results): lang_no_suffix = wl_conversion.remove_lang_code_suffixes(main, lang) - test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') lemmatizer = f'spacy_{lang_no_suffix}' - # Untokenized - lemmas = wl_lemmatization.wl_lemmatize( - main, - inputs = test_sentence, - lang = lang, - lemmatizer = lemmatizer - ) - - # Tokenized - lemmas_tokenized = wl_lemmatization.wl_lemmatize( - main, - inputs = tokens, - lang = lang, - lemmatizer = lemmatizer - ) - - print(f'{lang} / {lemmatizer}:') - print(f'{lemmas}\n') - - # Check for empty lemmas - assert lemmas == results - assert lemmas_tokenized - assert all(lemmas) - assert all(lemmas_tokenized) + test_lemmatization.wl_test_lemmatize_models(lang, lemmatizer, test_sentence, tokens, results) - # Tokenization should not be modified - assert len(tokens) == len(lemmas_tokenized) - - # Tagged texts - main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - - lemmas_tokenized_tagged = wl_lemmatization.wl_lemmatize( - main, - inputs = [token + '_TEST' for token in tokens], - lang = lang, - lemmatizer = lemmatizer, - tagged = True - ) - - assert lemmas_tokenized_tagged == [lemma + '_TEST' for lemma in lemmas_tokenized] - - # Long texts - lemmas_tokenized_long = wl_lemmatization.wl_lemmatize( - main, - inputs = [str(i) for i in range(101) for j in range(10)], - lang = lang, - lemmatizer = lemmatizer - ) - - assert lemmas_tokenized_long == [str(i) for i in range(101) for j in range(10)] - -def wl_test_dependency_parse(lang, tokens, results): +def wl_test_dependency_parse(lang, test_sentence, tokens, results): lang_no_suffix = wl_conversion.remove_lang_code_suffixes(main, lang) - test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') dependency_parser = f'spacy_{lang_no_suffix}' - # Untokenized - dependencies = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = test_sentence, - lang = lang, - dependency_parser = dependency_parser - ) - - # Tokenized - dependencies_tokenized = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = tokens, - lang = lang, - dependency_parser = dependency_parser - ) - - print(f'{lang} / 
{dependency_parser}:') - print(f'{dependencies}\n') - - # Check for empty dependencies - assert dependencies == results - assert dependencies_tokenized - assert all(dependencies) - assert all(dependencies_tokenized) - - for dependency in dependencies + dependencies_tokenized: - assert len(dependency) == 4 - - # Tokenization should not be modified - assert len(tokens) == len(dependencies_tokenized) - - # Tagged texts - main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - - dependencies_tokenized_tagged = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = [token + '_TEST' for token in tokens], - lang = lang, - dependency_parser = dependency_parser, - tagged = True - ) - - dependencies_tokenized = [ - (child + '_TEST', head + '_TEST', dependency_relation, dependency_dist) - for child, head, dependency_relation, dependency_dist in dependencies_tokenized - ] - - assert dependencies_tokenized_tagged == dependencies_tokenized - - # Long texts - dependencies_tokenized_long = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = [str(i) for i in range(101) for j in range(10)], - lang = lang, - dependency_parser = dependency_parser - ) - - assert [dependency[0] for dependency in dependencies_tokenized_long] == [str(i) for i in range(101) for j in range(10)] + test_dependency_parsing.wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, tokens, results) diff --git a/tests/tests_nlp/tests_stanza/test_stanza.py b/tests/tests_nlp/tests_stanza/test_stanza.py index 64c6d12ed..6d437bd82 100644 --- a/tests/tests_nlp/tests_stanza/test_stanza.py +++ b/tests/tests_nlp/tests_stanza/test_stanza.py @@ -17,10 +17,8 @@ # ---------------------------------------------------------------------- from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import ( - wl_dependency_parsing, wl_lemmatization, wl_nlp_utils, wl_pos_tagging, wl_sentence_tokenization, - wl_sentiment_analysis, wl_word_tokenization -) +from tests.tests_nlp import test_dependency_parsing, test_lemmatization, test_pos_tagging, test_sentiment_analysis +from wordless.wl_nlp import wl_nlp_utils, wl_sentence_tokenization, wl_texts, wl_word_tokenization from wordless.wl_utils import wl_conversion main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'stanza') @@ -36,28 +34,30 @@ def wl_test_stanza( ): wl_nlp_utils.check_models(main, langs = [lang], lang_utils = [[wl_test_get_lang_util(main, lang)]]) + test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') + if lang in wl_nlp_utils.get_langs_stanza(main, util_type = 'word_tokenizers'): wl_test_sentence_tokenize(lang, results_sentence_tokenize) - wl_test_word_tokenize(lang, results_word_tokenize) + wl_test_word_tokenize(lang, test_sentence, results_word_tokenize) # Tokenized tokens = wl_word_tokenization.wl_word_tokenize_flat( main, - text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'), + text = test_sentence, lang = lang ) if lang in wl_nlp_utils.get_langs_stanza(main, util_type = 'pos_taggers'): - wl_test_pos_tag(lang, tokens, results_pos_tag, results_pos_tag_universal) + wl_test_pos_tag(lang, test_sentence, tokens, results_pos_tag, results_pos_tag_universal) if lang in wl_nlp_utils.get_langs_stanza(main, util_type = 'lemmatizers'): - wl_test_lemmatize(lang, tokens, results_lemmatize) + wl_test_lemmatize(lang, test_sentence, tokens, results_lemmatize) if lang in wl_nlp_utils.get_langs_stanza(main, util_type = 'dependency_parsers'): - wl_test_dependency_parse(lang, 
tokens, results_dependency_parse) + wl_test_dependency_parse(lang, test_sentence, tokens, results_dependency_parse) if lang in wl_nlp_utils.get_langs_stanza(main, util_type = 'sentiment_analyzers'): - wl_test_sentiment_analyze(lang, tokens, results_sentiment_analayze) + wl_test_sentiment_analyze(lang, test_sentence, tokens, results_sentiment_analayze) def wl_test_get_lang_util(main, lang): if lang in ['zho_cn', 'zho_tw', 'srp_latn']: @@ -89,8 +89,7 @@ def wl_test_sentence_tokenize(lang, results): assert sentences == results -def wl_test_word_tokenize(lang, results): - test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') +def wl_test_word_tokenize(lang, test_sentence, results): word_tokenizer = wl_test_get_lang_util(main, lang) tokens = wl_word_tokenization.wl_word_tokenize_flat( @@ -113,235 +112,30 @@ def wl_test_word_tokenize(lang, results): else: assert len(tokens) > len(test_sentence.split()) - assert tokens == results + assert wl_texts.to_display_texts(tokens) == results -def wl_test_pos_tag(lang, tokens, results, results_universal): - test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') +def wl_test_pos_tag(lang, test_sentence, tokens, results, results_universal): pos_tagger = wl_test_get_lang_util(main, lang) - # Untokenized - tokens_tagged = wl_pos_tagging.wl_pos_tag( - main, - inputs = test_sentence, - lang = lang, - pos_tagger = pos_tagger - ) - tokens_tagged_universal = wl_pos_tagging.wl_pos_tag( - main, - inputs = test_sentence, - lang = lang, - pos_tagger = pos_tagger, - tagset = 'universal' - ) - - # Tokenized - tokens_tagged_tokenized = wl_pos_tagging.wl_pos_tag( - main, - inputs = tokens, - lang = lang, - pos_tagger = pos_tagger - ) - tokens_tagged_universal_tokenized = wl_pos_tagging.wl_pos_tag( - main, - inputs = tokens, - lang = lang, - pos_tagger = pos_tagger, - tagset = 'universal' - ) - - print(f'{lang} / {pos_tagger}:') - print(tokens_tagged) - print(f'{tokens_tagged_universal}\n') - - # Check for empty tags - assert tokens_tagged == results - assert tokens_tagged_universal == results_universal - assert tokens_tagged_tokenized - assert tokens_tagged_universal_tokenized - assert all((tag for token, tag in tokens_tagged)) - assert all((tag for token, tag in tokens_tagged_universal)) - assert all((tag for token, tag in tokens_tagged_tokenized)) - assert all((tag for token, tag in tokens_tagged_universal_tokenized)) - # Universal tags should not all be "X" - assert any((tag for token, tag in tokens_tagged_universal if tag != 'X')) - assert any((tag for token, tag in tokens_tagged_universal_tokenized if tag != 'X')) + test_pos_tagging.wl_test_pos_tag_models(lang, pos_tagger, test_sentence, tokens, results, results_universal) - # Tokenization should not be modified - assert len(tokens) == len(tokens_tagged_tokenized) == len(tokens_tagged_universal_tokenized) - - # Long texts - tokens_tagged_tokenized_long = wl_pos_tagging.wl_pos_tag( - main, - inputs = [str(i) for i in range(101) for j in range(10)], - lang = lang, - pos_tagger = pos_tagger - ) - - assert [token[0] for token in tokens_tagged_tokenized_long] == [str(i) for i in range(101) for j in range(10)] - -def wl_test_lemmatize(lang, tokens, results): - test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') +def wl_test_lemmatize(lang, test_sentence, tokens, results): lemmatizer = wl_test_get_lang_util(main, lang) - # Untokenized - lemmas = wl_lemmatization.wl_lemmatize( - main, - inputs = test_sentence, - lang = lang, - lemmatizer = lemmatizer - ) - - # 
Tokenized - lemmas_tokenized = wl_lemmatization.wl_lemmatize( - main, - inputs = tokens, - lang = lang, - lemmatizer = lemmatizer + test_lemmatization.wl_test_lemmatize_models( + lang, lemmatizer, test_sentence, tokens, results, + lang_exceptions = [ + 'bul', 'chu', 'cop', 'est', 'got', 'grc', 'ell', 'hin', 'isl', 'lij', + 'lit', 'glv', 'pcm', 'pol', 'orv', 'sme', 'san', 'tur', 'cym' + ] ) - print(f'{lang} / {lemmatizer}:') - print(f'{lemmas}\n') - - # Check for empty lemmas - assert lemmas == results - assert lemmas_tokenized - assert all(lemmas) - assert all(lemmas_tokenized) - - # Tokenization should not be modified - assert len(tokens) == len(lemmas_tokenized) - - # Tagged texts - main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - - lemmas_tokenized_tagged = wl_lemmatization.wl_lemmatize( - main, - inputs = [token + '_TEST' for token in tokens], - lang = lang, - lemmatizer = lemmatizer, - tagged = True - ) - - assert lemmas_tokenized_tagged == [lemma + '_TEST' for lemma in lemmas_tokenized] - - # Long texts - lemmas_tokenized_long = wl_lemmatization.wl_lemmatize( - main, - inputs = [str(i) for i in range(101) for j in range(10)], - lang = lang, - lemmatizer = lemmatizer - ) - - if lang in [ - 'bul', 'chu', 'cop', 'est', 'got', 'grc', 'ell', 'hin', 'isl', 'lij', - 'lit', 'glv', 'pcm', 'pol', 'orv', 'sme', 'san', 'tur', 'cym' - ]: - assert len(lemmas_tokenized_long) == 101 * 10 - else: - assert lemmas_tokenized_long == [str(i) for i in range(101) for j in range(10)] - -def wl_test_dependency_parse(lang, tokens, results): - test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') +def wl_test_dependency_parse(lang, test_sentence, tokens, results): dependency_parser = wl_test_get_lang_util(main, lang) - # Untokenized - dependencies = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = test_sentence, - lang = lang, - dependency_parser = dependency_parser - ) - - # Tokenized - dependencies_tokenized = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = tokens, - lang = lang, - dependency_parser = dependency_parser - ) - - print(f'{lang} / {dependency_parser}:') - print(f'{dependencies}\n') - - # Check for empty dependencies - assert dependencies == results - assert dependencies_tokenized - assert all(dependencies) - assert all(dependencies_tokenized) - - for dependency in dependencies + dependencies_tokenized: - assert len(dependency) == 4 - - # Tokenization should not be modified - assert len(tokens) == len(dependencies_tokenized) - - # Tagged texts - main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - - dependencies_tokenized_tagged = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = [token + '_TEST' for token in tokens], - lang = lang, - dependency_parser = dependency_parser, - tagged = True - ) - - dependencies_tokenized = [ - (child + '_TEST', head + '_TEST', dependency_relation, dependency_dist) - for child, head, dependency_relation, dependency_dist in dependencies_tokenized - ] - - assert dependencies_tokenized_tagged == dependencies_tokenized + test_dependency_parsing.wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, tokens, results) - # Long texts - dependencies_tokenized_long = wl_dependency_parsing.wl_dependency_parse( - main, - inputs = [str(i) for i in range(101) for j in range(10)], - lang = lang, - dependency_parser = dependency_parser - ) - - assert [dependency[0] for dependency in 
dependencies_tokenized_long] == [str(i) for i in range(101) for j in range(10)] - -def wl_test_sentiment_analyze(lang, tokens, results): - test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}') +def wl_test_sentiment_analyze(lang, test_sentence, tokens, results): sentiment_analyzer = wl_test_get_lang_util(main, lang) - # Untokenized - sentiment_scores = wl_sentiment_analysis.wl_sentiment_analyze( - main, - inputs = [test_sentence], - lang = lang, - sentiment_analyzer = sentiment_analyzer - ) - - # Tokenized - sentiment_scores_tokenized = wl_sentiment_analysis.wl_sentiment_analyze( - main, - inputs = [tokens], - lang = lang, - sentiment_analyzer = sentiment_analyzer - ) - - print(f'{lang} / {sentiment_analyzer}:') - print(f'{sentiment_scores}\n') - - # Check for empty results - assert sentiment_scores == results - assert sentiment_scores_tokenized == results - - for sentiment_score in sentiment_scores + sentiment_scores_tokenized: - assert -1 <= sentiment_score <= 1 - - # Tagged texts - main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']] - - sentiment_scores_tokenized_tagged = wl_sentiment_analysis.wl_sentiment_analyze( - main, - inputs = [[token + '_TEST' for token in tokens]], - lang = lang, - sentiment_analyzer = sentiment_analyzer, - tagged = True - ) - - assert sentiment_scores_tokenized_tagged == sentiment_scores_tokenized + test_sentiment_analysis.wl_test_sentiment_analyze_models(lang, sentiment_analyzer, test_sentence, tokens, results) diff --git a/tests/tests_settings/test_settings_global.py b/tests/tests_settings/test_settings_global.py index 31663076d..aa14c42c3 100644 --- a/tests/tests_settings/test_settings_global.py +++ b/tests/tests_settings/test_settings_global.py @@ -78,6 +78,7 @@ def __init__(self): self.invalid_lang_utils = False self.lang_default_missing = False self.lang_default_extra = False + self.invalid_default_lang_util = False def check_missing_extra_langs(self, langs_supported, langs_global, msg): for lang_code in langs_supported: diff --git a/tests/wl_test_file_area.py b/tests/wl_test_file_area.py index 4f43d01a4..1b5e06bfa 100644 --- a/tests/wl_test_file_area.py +++ b/tests/wl_test_file_area.py @@ -118,7 +118,7 @@ def update_gui_ref(err_msg, new_files): assert new_file['path_original'] == wl_paths.get_normalized_path(file_path) - if i < NUM_FILES_ALL: + if i < NUM_FILES_ALL or new_file['name'] == '[eng_gb] Tagged': assert new_file['encoding'] == 'utf_8' else: assert new_file['encoding'] == 'ascii' @@ -127,6 +127,9 @@ def update_gui_ref(err_msg, new_files): assert not new_file['tokenized'] assert not new_file['tagged'] + if new_file['name'] == '[eng_gb] Tagged': + new_file['tagged'] = True + print(f'done! (In {round(time.time() - time_start, 2)} seconds)') # Save Settings diff --git a/tests/wl_test_lang_examples.py b/tests/wl_test_lang_examples.py index ed07ee16b..6ac37e6a2 100644 --- a/tests/wl_test_lang_examples.py +++ b/tests/wl_test_lang_examples.py @@ -469,6 +469,8 @@ SENTENCE_ZUL = 'Zulu /ˈzuːluː/, noma isiZulu wulimi lwabantu base Ningizimu neAfrika abayingxenye yamaNguni.' 
SENTENCE_OTHER = SENTENCE_ENG_US +TOKENS_LONG = [str(i) for i in range(101) for j in range(10)] + def check_lang_examples(main): settings_langs = settings_langs = [lang[0] for lang in main.settings_global['langs'].values()] diff --git a/wordless/wl_colligation_extractor.py b/wordless/wl_colligation_extractor.py index 6d4086620..d556a4adf 100644 --- a/wordless/wl_colligation_extractor.py +++ b/wordless/wl_colligation_extractor.py @@ -723,9 +723,12 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats self.set_item_num(i, 0, -1) # Node - self.model().setItem(i, 1, wl_tables.Wl_Table_Item(' '.join(node))) + self.model().setItem(i, 1, wl_tables.Wl_Table_Item(' '.join(wl_texts.to_display_texts(node)))) + self.model().item(i, 1).tokens_filter = node + # Collocate - self.model().setItem(i, 2, wl_tables.Wl_Table_Item(collocate)) + self.model().setItem(i, 2, wl_tables.Wl_Table_Item(collocate.display_text())) + self.model().item(i, 2).tokens_filter = [collocate] # Frequency for j, freqs_file in enumerate(freqs_files): @@ -820,74 +823,45 @@ def update_gui_fig(self, err_msg, colligations_freqs_file, colligations_stats_fi else: span_position = span_positions.index(int(settings['fig_settings']['use_data'][1:])) - # Network Graph - if settings['fig_settings']['graph_type'] == self.tr('Network Graph'): - collocates_freq_files = { - (' '.join(node), collocate): numpy.array(freqs)[:, span_position] - for (node, collocate), freqs in colligations_freqs_file.items() - } - # Line Chart & Word Cloud - else: - collocates_freq_files = { - ', '.join([' '.join(node), collocate]): numpy.array(freqs)[:, span_position] - for (node, collocate), freqs in colligations_freqs_file.items() - } + collocates_freq_files = { + colligation: numpy.array(freqs)[:, span_position] + for colligation, freqs in colligations_freqs_file.items() + } wl_figs_freqs.wl_fig_freqs( self.main, collocates_freq_files, tab = 'colligation_extractor' ) elif settings['fig_settings']['use_data'] == self.tr('Frequency'): - # Network Graph - if settings['fig_settings']['graph_type'] == self.tr('Network graph'): - collocates_freq_files = { - (' '.join(node), collocate): numpy.array(freqs).sum(axis = 1) - for (node, collocate), freqs in colligations_freqs_file.items() - } - # Line Chart & Word Cloud - else: - collocates_freq_files = { - ', '.join([' '.join(node), collocate]): numpy.array(freqs).sum(axis = 1) - for (node, collocate), freqs in colligations_freqs_file.items() - } + collocates_freq_files = { + colligation: numpy.array(freqs).sum(axis = 1) + for colligation, freqs in colligations_freqs_file.items() + } wl_figs_freqs.wl_fig_freqs( self.main, collocates_freq_files, tab = 'colligation_extractor' ) else: - # Network Graph - if settings['fig_settings']['graph_type'] == self.tr('Network graph'): - colligations_stats_files = { - (' '.join(node), collocate): freqs - for (node, collocate), freqs in colligations_stats_files.items() - } - # Line Chart & Word Cloud - else: - colligations_stats_files = { - ', '.join([' '.join(node), collocate]): freqs - for (node, collocate), freqs in colligations_stats_files.items() - } - if settings['fig_settings']['use_data'] == col_text_test_stat: collocates_stat_files = { - collocate: numpy.array(stats_files)[:, 0] - for collocate, stats_files in colligations_stats_files.items() + colligation: numpy.array(stats_files)[:, 0] + for colligation, stats_files in colligations_stats_files.items() } elif settings['fig_settings']['use_data'] == self.tr('p-value'): collocates_stat_files = { - 
collocate: numpy.array(stats_files)[:, 1] - for collocate, stats_files in colligations_stats_files.items() + colligation: numpy.array(stats_files)[:, 1] + for colligation, stats_files in colligations_stats_files.items() } elif settings['fig_settings']['use_data'] == self.tr('Bayes factor'): collocates_stat_files = { - collocate: numpy.array(stats_files)[:, 2] - for collocate, stats_files in colligations_stats_files.items() + colligation: numpy.array(stats_files)[:, 2] + for colligation, stats_files in colligations_stats_files.items() } elif settings['fig_settings']['use_data'] == col_text_effect_size: collocates_stat_files = { - collocate: numpy.array(stats_files)[:, 3] - for collocate, stats_files in colligations_stats_files.items() + colligation: numpy.array(stats_files)[:, 3] + for colligation, stats_files in colligations_stats_files.items() } wl_figs_stats.wl_fig_stats( @@ -935,10 +909,8 @@ def run(self): colligations_freqs_file = {} colligations_freqs_file_all = {} - text = copy.deepcopy(file['text']) - - text = wl_token_processing.wl_process_tokens_colligation_extractor( - self.main, text, + text = wl_token_processing.wl_process_tokens( + self.main, file['text'], token_settings = settings['token_settings'] ) @@ -952,7 +924,6 @@ def run(self): search_terms = wl_matching.match_search_terms_ngrams( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], search_settings = settings['search_settings'] ) @@ -963,7 +934,6 @@ def run(self): ) = wl_matching.match_search_terms_context( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], context_settings = settings['context_settings'] ) @@ -978,7 +948,6 @@ def run(self): len_paras = len(offsets_paras) len_sentences = len(offsets_sentences) len_sentence_segs = len(offsets_sentence_segs) - len_tokens = len(tokens) settings_limit_searching = settings['generation_settings']['limit_searching'] @@ -1001,27 +970,29 @@ def run(self): i_unit = bisect.bisect(offsets_unit, i) - 1 i_unit_start = offsets_unit[i_unit] - i_unit_end = offsets_unit[i_unit + 1] - 1 if i_unit < len_unit - 1 else len_tokens - 1 + i_unit_end = offsets_unit[i_unit + 1] - 1 if i_unit < len_unit - 1 else text.num_tokens - 1 # Extract collocates tags_left = [] tags_right = [] + tags = wl_texts.to_tokens(wl_texts.get_token_properties(tokens, 'tag'), lang = file['lang']) + if window_left < 0 < window_right: # Limit Searching if settings_limit_searching == _tr('wl_colligation_extractor', 'None'): - tags_left = text.tags[max(0, i + window_left) : i] - tags_right = text.tags[i + ngram_size : i + ngram_size + window_right] + tags_left = tags[max(0, i + window_left) : i] + tags_right = tags[i + ngram_size : i + ngram_size + window_right] else: # Span positions (Left) for position in range(max(0, i + window_left), i): if i_unit_start <= position <= i_unit_end: - tags_left.append(text.tags[position]) + tags_left.append(tags[position]) # Span positions (Right) for position in range(i + ngram_size, i + ngram_size + window_right): if i_unit_start <= position <= i_unit_end: - tags_right.append(text.tags[position]) + tags_right.append(tags[position]) for j, collocate in enumerate(reversed(tags_left)): if wl_matching.check_context( @@ -1053,12 +1024,12 @@ def run(self): elif window_left < 0 and window_right < 0: # Limit Searching if settings_limit_searching == _tr('wl_colligation_extractor', 'None'): - tags_left = text.tags[max(0, i + window_left) : max(0, i + window_right + 1)] + tags_left = 
tags[max(0, i + window_left) : max(0, i + window_right + 1)] else: # Span positions (Left) for position in range(max(0, i + window_left), max(0, i + window_right + 1)): if i_unit_start <= position <= i_unit_end: - tags_left.append(text.tags[position]) + tags_left.append(tags[position]) for j, collocate in enumerate(reversed(tags_left)): if wl_matching.check_context( @@ -1076,12 +1047,12 @@ def run(self): elif window_left > 0 and window_right > 0: # Limit Searching if settings_limit_searching == _tr('wl_colligation_extractor', 'None'): - tags_right = text.tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right] + tags_right = tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right] else: # Span positions (Right) for position in range(i + ngram_size + window_left - 1, i + ngram_size + window_right): if i_unit_start <= position <= i_unit_end: - tags_right.append(text.tags[position]) + tags_right.append(tags[position]) for j, collocate in enumerate(tags_right): if wl_matching.check_context( @@ -1126,6 +1097,8 @@ def run(self): colligations_freqs_total = {} colligations_freqs_total_all = {} + texts.append(wl_texts.Wl_Text_Blank()) + # Frequency for colligations_freqs_file in self.colligations_freqs_files: for colligation, freqs in colligations_freqs_file.items(): @@ -1133,6 +1106,7 @@ def run(self): colligations_freqs_total[colligation] = freqs else: colligations_freqs_total[colligation] = list(map(operator.add, colligations_freqs_total[colligation], freqs)) + # Frequency (All) for colligations_freqs_file_all in colligations_freqs_files_all: for ngram_size, colligations_freqs in colligations_freqs_file_all.items(): @@ -1144,8 +1118,6 @@ def run(self): self.colligations_freqs_files.append(colligations_freqs_total) colligations_freqs_files_all.append(colligations_freqs_total_all) - texts.append(wl_texts.Wl_Text_Blank()) - test_statistical_significance = settings['generation_settings']['test_statistical_significance'] measure_bayes_factor = settings['generation_settings']['measure_bayes_factor'] measure_effect_size = settings['generation_settings']['measure_effect_size'] diff --git a/wordless/wl_collocation_extractor.py b/wordless/wl_collocation_extractor.py index 3edca65ad..5eeb9ea27 100644 --- a/wordless/wl_collocation_extractor.py +++ b/wordless/wl_collocation_extractor.py @@ -724,9 +724,12 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats self.set_item_num(i, 0, -1) # Node - self.model().setItem(i, 1, wl_tables.Wl_Table_Item(' '.join(node))) + self.model().setItem(i, 1, wl_tables.Wl_Table_Item(' '.join(wl_texts.to_display_texts(node)))) + self.model().item(i, 1).tokens_filter = node + # Collocate - self.model().setItem(i, 2, wl_tables.Wl_Table_Item(collocate)) + self.model().setItem(i, 2, wl_tables.Wl_Table_Item(collocate.display_text())) + self.model().item(i, 2).tokens_filter = [collocate] # Frequency for j, freqs_file in enumerate(freqs_files): @@ -817,78 +820,49 @@ def update_gui_fig(self, err_msg, collocations_freqs_files, collocations_stats_f else: span_position = span_positions.index(int(settings['fig_settings']['use_data'][1:])) - # Network Graph - if settings['fig_settings']['graph_type'] == self.tr('Network graph'): - collocates_freq_files = { - (' '.join(node), collocate): numpy.array(freqs)[:, span_position] - for (node, collocate), freqs in collocations_freqs_files.items() - } - # Line Chart & Word Cloud - else: - collocates_freq_files = { - ', '.join([' '.join(node), collocate]): numpy.array(freqs)[:, span_position] - 
for (node, collocate), freqs in collocations_freqs_files.items() - } + collocations_freq_files = { + collocation: numpy.array(freqs)[:, span_position] + for collocation, freqs in collocations_freqs_files.items() + } wl_figs_freqs.wl_fig_freqs( - self.main, collocates_freq_files, + self.main, collocations_freq_files, tab = 'collocation_extractor' ) elif settings['fig_settings']['use_data'] == self.tr('Frequency'): - # Network Graph - if settings['fig_settings']['graph_type'] == self.tr('Network graph'): - collocates_freq_files = { - (' '.join(node), collocate): numpy.array(freqs).sum(axis = 1) - for (node, collocate), freqs in collocations_freqs_files.items() - } - # Line Chart & Word Cloud - else: - collocates_freq_files = { - ', '.join([' '.join(node), collocate]): numpy.array(freqs).sum(axis = 1) - for (node, collocate), freqs in collocations_freqs_files.items() - } + collocations_freq_files = { + collocation: numpy.array(freqs).sum(axis = 1) + for collocation, freqs in collocations_freqs_files.items() + } wl_figs_freqs.wl_fig_freqs( - self.main, collocates_freq_files, + self.main, collocations_freq_files, tab = 'collocation_extractor' ) else: - # Network Graph - if settings['fig_settings']['graph_type'] == self.tr('Network graph'): - collocations_stats_files = { - (' '.join(node), collocate): freqs - for (node, collocate), freqs in collocations_stats_files.items() - } - # Line Chart & Word Cloud - else: - collocations_stats_files = { - ', '.join([' '.join(node), collocate]): freqs - for (node, collocate), freqs in collocations_stats_files.items() - } - if settings['fig_settings']['use_data'] == col_text_test_stat: - collocates_stat_files = { - collocate: numpy.array(stats_files)[:, 0] - for collocate, stats_files in collocations_stats_files.items() + collocations_stat_files = { + collocation: numpy.array(stats_files)[:, 0] + for collocation, stats_files in collocations_stats_files.items() } elif settings['fig_settings']['use_data'] == self.tr('p-value'): - collocates_stat_files = { - collocate: numpy.array(stats_files)[:, 1] - for collocate, stats_files in collocations_stats_files.items() + collocations_stat_files = { + collocation: numpy.array(stats_files)[:, 1] + for collocation, stats_files in collocations_stats_files.items() } elif settings['fig_settings']['use_data'] == self.tr('Bayes factor'): - collocates_stat_files = { - collocate: numpy.array(stats_files)[:, 2] - for collocate, stats_files in collocations_stats_files.items() + collocations_stat_files = { + collocation: numpy.array(stats_files)[:, 2] + for collocation, stats_files in collocations_stats_files.items() } elif settings['fig_settings']['use_data'] == col_text_effect_size: - collocates_stat_files = { - collocate: numpy.array(stats_files)[:, 3] - for collocate, stats_files in collocations_stats_files.items() + collocations_stat_files = { + collocation: numpy.array(stats_files)[:, 3] + for collocation, stats_files in collocations_stats_files.items() } wl_figs_stats.wl_fig_stats( - self.main, collocates_stat_files, + self.main, collocations_stat_files, tab = 'collocation_extractor' ) @@ -932,9 +906,8 @@ def run(self): collocations_freqs_file = {} collocations_freqs_file_all = {} - text = copy.deepcopy(file['text']) text = wl_token_processing.wl_process_tokens( - self.main, text, + self.main, file['text'], token_settings = settings['token_settings'] ) @@ -948,7 +921,6 @@ def run(self): search_terms = wl_matching.match_search_terms_ngrams( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = 
settings['token_settings'], search_settings = settings['search_settings'] ) @@ -959,7 +931,6 @@ def run(self): ) = wl_matching.match_search_terms_context( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], context_settings = settings['context_settings'] ) @@ -974,7 +945,6 @@ def run(self): len_paras = len(offsets_paras) len_sentences = len(offsets_sentences) len_sentence_segs = len(offsets_sentence_segs) - len_tokens = len(tokens) settings_limit_searching = settings['generation_settings']['limit_searching'] @@ -997,7 +967,7 @@ def run(self): i_unit = bisect.bisect(offsets_unit, i) - 1 i_unit_start = offsets_unit[i_unit] - i_unit_end = offsets_unit[i_unit + 1] - 1 if i_unit < len_unit - 1 else len_tokens - 1 + i_unit_end = offsets_unit[i_unit + 1] - 1 if i_unit < len_unit - 1 else text.num_tokens - 1 # Extract collocates tokens_left = [] @@ -1119,6 +1089,8 @@ def run(self): # Total if len(files) > 1: + texts.append(wl_texts.Wl_Text_Blank()) + collocations_freqs_total = {} collocations_freqs_total_all = {} @@ -1141,8 +1113,6 @@ def run(self): self.collocations_freqs_files.append(collocations_freqs_total) collocations_freqs_files_all.append(collocations_freqs_total_all) - texts.append(wl_texts.Wl_Text_Blank()) - test_statistical_significance = settings['generation_settings']['test_statistical_significance'] measure_bayes_factor = settings['generation_settings']['measure_bayes_factor'] measure_effect_size = settings['generation_settings']['measure_effect_size'] diff --git a/wordless/wl_concordancer.py b/wordless/wl_concordancer.py index db3bd0b5f..653c84ece 100644 --- a/wordless/wl_concordancer.py +++ b/wordless/wl_concordancer.py @@ -31,7 +31,7 @@ from wordless.wl_checks import wl_checks_work_area from wordless.wl_dialogs import wl_dialogs_misc from wordless.wl_figs import wl_figs -from wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_token_processing, wl_sentiment_analysis +from wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_texts, wl_token_processing, wl_sentiment_analysis from wordless.wl_utils import wl_misc, wl_threading from wordless.wl_widgets import wl_boxes, wl_labels, wl_layouts, wl_tables, wl_widgets @@ -507,9 +507,9 @@ def update_gui_table(self, err_msg, concordance_lines): node_color = self.main.settings_custom['tables']['concordancer']['sorting_settings']['highlight_colors']['lvl_1'] for i, concordance_line in enumerate(concordance_lines): - left_text, left_text_raw, left_text_search = concordance_line[0] - node_text, node_text_raw, node_text_search = concordance_line[1] - right_text, right_text_raw, right_text_search = concordance_line[2] + left_tokens_raw, left_tokens_search = concordance_line[0] + node_tokens_raw, node_tokens_search = concordance_line[1] + right_tokens_raw, right_tokens_search = concordance_line[2] sentiment = concordance_line[3] no_token, len_tokens = concordance_line[4] @@ -522,7 +522,7 @@ def update_gui_table(self, err_msg, concordance_lines): label_node = wl_labels.Wl_Label_Html( f''' -  {node_text}  +  {' '.join(node_tokens_raw)}  ''', self.main @@ -532,28 +532,28 @@ def update_gui_table(self, err_msg, concordance_lines): self.indexWidget(self.model().index(i, 1)).setAlignment(Qt.AlignHCenter | Qt.AlignVCenter) - self.indexWidget(self.model().index(i, 1)).text_raw = node_text_raw - self.indexWidget(self.model().index(i, 1)).text_search = node_text_search + self.indexWidget(self.model().index(i, 1)).tokens_raw = node_tokens_raw + self.indexWidget(self.model().index(i, 1)).tokens_search = 
node_tokens_search # Left self.setIndexWidget( self.model().index(i, 0), - wl_labels.Wl_Label_Html(left_text, self.main) + wl_labels.Wl_Label_Html(' '.join(left_tokens_raw), self.main) ) self.indexWidget(self.model().index(i, 0)).setAlignment(Qt.AlignRight | Qt.AlignVCenter) - self.indexWidget(self.model().index(i, 0)).text_raw = left_text_raw - self.indexWidget(self.model().index(i, 0)).text_search = left_text_search + self.indexWidget(self.model().index(i, 0)).tokens_raw = left_tokens_raw + self.indexWidget(self.model().index(i, 0)).tokens_search = left_tokens_search # Right self.setIndexWidget( self.model().index(i, 2), - wl_labels.Wl_Label_Html(right_text, self.main) + wl_labels.Wl_Label_Html(' '.join(right_tokens_raw), self.main) ) - self.indexWidget(self.model().index(i, 2)).text_raw = right_text_raw - self.indexWidget(self.model().index(i, 2)).text_search = right_text_search + self.indexWidget(self.model().index(i, 2)).tokens_raw = right_tokens_raw + self.indexWidget(self.model().index(i, 2)).tokens_search = right_tokens_search # Sentiment if not isinstance(sentiment, str): @@ -669,9 +669,8 @@ def run(self): for file in self.main.wl_file_area.get_selected_files(): concordance_lines_file = [] - text = copy.deepcopy(file['text']) text = wl_token_processing.wl_process_tokens_concordancer( - self.main, text, + self.main, file['text'], token_settings = settings['token_settings'] ) @@ -685,7 +684,6 @@ def run(self): search_terms = wl_matching.match_search_terms_ngrams( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], search_settings = settings['search_settings'] ) @@ -696,7 +694,6 @@ def run(self): ) = wl_matching.match_search_terms_context( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], context_settings = settings['context_settings'] ) @@ -711,7 +708,6 @@ def run(self): len_paras = len(offsets_paras) len_sentences = len(offsets_sentences) len_sentence_segs = len(offsets_sentence_segs) - len_tokens = len(tokens) sentiment_inputs = [] @@ -734,104 +730,59 @@ def run(self): no_para = bisect.bisect(offsets_paras, i) # Search in Results (Node) - text_search_node = list(ngram) - - if not settings['token_settings']['punc_marks']: - ngram = text.tokens_flat_punc_marks_merged[i : i + len_search_term] - - node_text = ' '.join(ngram) - node_text = wl_nlp_utils.escape_text(node_text) + node_tokens_search = list(ngram) + node_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(ngram)) # Width Unit if settings['generation_settings']['width_unit'] == self.tr('Character'): len_context_left = 0 len_context_right = 0 - context_left = [] - context_right = [] + left_tokens_raw = [] + right_tokens_raw = [] width_left_char = settings['generation_settings']['width_left_char'] width_right_char = settings['generation_settings']['width_right_char'] while len_context_left < width_left_char: - if i - 1 - len(context_left) < 0: + if i - 1 - len(left_tokens_raw) < 0: break else: - token_next = tokens[i - 1 - len(context_left)] + token_next = tokens[i - 1 - len(left_tokens_raw)] len_token_next = len(token_next) if len_context_left + len_token_next > width_left_char: - context_left.insert(0, token_next[-(width_left_char - len_context_left):]) + left_tokens_raw.insert(0, wl_texts.set_token_text( + token_next, + token_next[-(width_left_char - len_context_left):] + )) else: - context_left.insert(0, token_next) + left_tokens_raw.insert(0, token_next) len_context_left += len_token_next while 
len_context_right < width_right_char: - if i + len_search_term + len(context_right) > len(text.tokens_flat_punc_marks_merged) - 1: + if i + len_search_term + len(right_tokens_raw) > text.num_tokens - 1: break else: - token_next = tokens[i + len_search_term + len(context_right)] + token_next = tokens[i + len_search_term + len(right_tokens_raw)] len_token_next = len(token_next) if len_context_right + len_token_next > width_right_char: - context_right.append(token_next[: width_right_char - len_context_right]) + right_tokens_raw.append(wl_texts.set_token_text( + token_next, + token_next[: width_right_char - len_context_right] + )) else: - context_right.append(token_next) + right_tokens_raw.append(token_next) len_context_right += len_token_next - - # Search in results (Left & Right) - text_search_left = copy.deepcopy(context_left) - text_search_right = copy.deepcopy(context_right) - - if not settings['token_settings']['punc_marks']: - len_context_leftmost = len(context_left[0]) - len_context_rightmost = len(context_right[-1]) - - context_left = text.tokens_flat_punc_marks_merged[i - len(context_left): i] - context_right = text.tokens_flat_punc_marks_merged[i + len_search_term : i + len_search_term + len(context_right)] - - # Clip the leftmost and rightmost token in context - context_leftmost = '' - context_rightmost = '' - len_context_leftmost_no_puncs = 0 - len_context_rightmost_no_puncs = 0 - - for char in reversed(context_left[0]): - if len_context_leftmost_no_puncs < len_context_leftmost: - context_leftmost = char + context_leftmost - - if char.isalnum(): - len_context_leftmost_no_puncs += 1 - else: - break - - for char in reversed(context_right[-1]): - if len_context_rightmost_no_puncs < len_context_rightmost: - context_rightmost += char - - if char.isalnum(): - len_context_rightmost_no_puncs += 1 - else: - break - - context_left[0] = context_leftmost - context_right[-1] = context_rightmost elif settings['generation_settings']['width_unit'] == self.tr('Token'): width_left_token = settings['generation_settings']['width_left_token'] width_right_token = settings['generation_settings']['width_right_token'] - context_left = text.tokens_flat_punc_marks_merged[max(0, i - width_left_token) : i] - context_right = text.tokens_flat_punc_marks_merged[i + len_search_term : i + len_search_term + width_right_token] - - # Search in results (Left & Right) - if settings['token_settings']['punc_marks']: - text_search_left = copy.deepcopy(context_left) - text_search_right = copy.deepcopy(context_right) - else: - text_search_left = tokens[max(0, i - width_left_token) : i] - text_search_right = tokens[i + len_search_term : i + len_search_term + width_right_token] + left_tokens_raw = tokens[max(0, i - width_left_token) : i] + right_tokens_raw = tokens[i + len_search_term : i + len_search_term + width_right_token] else: if settings['generation_settings']['width_unit'] == self.tr('Sentence segment'): width_settings = 'sentence_seg' @@ -859,40 +810,35 @@ def run(self): else: offset_end = offsets_unit[no_unit + width_right] - context_left = text.tokens_flat_punc_marks_merged[offset_start:i] - context_right = text.tokens_flat_punc_marks_merged[i + len_search_term : offset_end] + left_tokens_raw = tokens[offset_start:i] + right_tokens_raw = tokens[i + len_search_term : offset_end] - # Search in results (Left & Right) - if settings['token_settings']['punc_marks']: - text_search_left = copy.deepcopy(context_left) - text_search_right = copy.deepcopy(context_right) - else: - text_search_left = tokens[offset_start:i] - 
text_search_right = tokens[i + len_search_term : offset_end] + # Search in results (Left & Right) + left_tokens_search = copy.deepcopy(left_tokens_raw) + right_tokens_search = copy.deepcopy(right_tokens_raw) # Remove empty tokens for searching in results - text_search_left = [token for token in text_search_left if token] - text_search_right = [token for token in text_search_right if token] + left_tokens_search = [token for token in left_tokens_search if token] + right_tokens_search = [token for token in right_tokens_search if token] - context_left = wl_nlp_utils.escape_tokens(context_left) - context_right = wl_nlp_utils.escape_tokens(context_right) - - context_left_text = ' '.join(context_left) - context_right_text = ' '.join(context_right) + left_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(left_tokens_raw)) + right_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(right_tokens_raw)) # Left - concordance_line.append([context_left_text, context_left, text_search_left]) + concordance_line.append([left_tokens_raw, left_tokens_search]) # Node - concordance_line.append([node_text, list(ngram), text_search_node]) + concordance_line.append([node_tokens_raw, node_tokens_search]) # Right - concordance_line.append([context_right_text, context_right, text_search_right]) + concordance_line.append([right_tokens_raw, right_tokens_search]) # Sentiment if text.lang in self.main.settings_global['sentiment_analyzers']: - sentiment_inputs.append(' '.join([context_left_text, node_text, context_right_text])) + sentiment_inputs.append(' '.join( + [*left_tokens_search, *node_tokens_search, *right_tokens_search] + )) # Token No. - concordance_line.append([i + 1, len_tokens]) + concordance_line.append([i + 1, text.num_tokens]) # Sentence Segment No. concordance_line.append([no_sentence_seg, len_sentence_segs]) # Sentence No. 
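# A minimal standalone sketch of the character-based width unit handled above: tokens are
# accumulated outward from the node and only the outermost token is clipped so the visible
# context fits the character budget; the patch keeps the clipped text attached to its token
# object via wl_texts.set_token_text. Plain strings are used here instead of Wl_Token objects.
def clip_left_context(tokens, node_i, width_left_char):
    # Collect tokens to the left of tokens[node_i], truncating the outermost one if it overshoots
    context = []
    len_context = 0
    i = node_i - 1

    while i >= 0 and len_context < width_left_char:
        token = tokens[i]

        if len_context + len(token) > width_left_char:
            # Keep only the rightmost characters that still fit
            context.insert(0, token[-(width_left_char - len_context):])
        else:
            context.insert(0, token)

        len_context += len(token)
        i -= 1

    return context

print(clip_left_context(['a', 'quick', 'brown', 'fox'], 3, 8))  # ['ick', 'brown']
# The right-hand side is symmetric: slice token[:remaining_budget] and append instead of insert.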
@@ -942,16 +888,14 @@ def run(self): files = sorted(self.main.wl_file_area.get_selected_files(), key = lambda item: item['name']) for file in files: - text = copy.deepcopy(file['text']) text = wl_token_processing.wl_process_tokens_concordancer( - self.main, text, + self.main, file['text'], token_settings = settings['token_settings'] ) search_terms_file = wl_matching.match_search_terms_ngrams( self.main, text.get_tokens_flat(), lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], search_settings = settings['search_settings'] ) @@ -960,12 +904,12 @@ def run(self): for search_term in search_terms_file: search_terms_total.add(search_term) - search_terms_labels.add(' '.join(search_term)) + search_terms_labels.add(' '.join(wl_texts.to_display_texts(search_term))) texts.append(text) len_files = len(files) - len_tokens_total = sum((len(text.get_tokens_flat()) for text in texts)) + len_tokens_total = sum((text.num_tokens for text in texts)) if settings['fig_settings']['sort_results_by'] == self.tr('File'): search_terms_total = sorted(search_terms_total) @@ -982,15 +926,14 @@ def run(self): if search_term in search_terms_files[j]: x_start_total = x_start + sum(( - len(text.get_tokens_flat()) + text.num_tokens for k, text in enumerate(texts) if k < j )) - len_tokens = len(tokens) for k, ngram in enumerate(wl_nlp_utils.ngrams(tokens, len_search_term)): if ngram == search_term: - points.append([x_start + k / len_tokens * len_tokens_total, y_start - j]) + points.append([x_start + k / text.num_tokens * len_tokens_total, y_start - j]) # Total points.append([x_start_total + k, 0]) elif settings['fig_settings']['sort_results_by'] == self.tr('Search term'): @@ -1003,7 +946,7 @@ def run(self): for j, text in enumerate(texts): if search_term in search_terms_files[j]: x_start = sum(( - len(text.get_tokens_flat()) + text.num_tokens for k, text in enumerate(texts) if k < j )) + j + 2 @@ -1017,7 +960,7 @@ def run(self): x_tick_labels = [''] if settings['fig_settings']['sort_results_by'] == self.tr('File'): - len_tokens_total = sum((len(text.get_tokens_flat()) for text in texts)) + len_tokens_total = sum((text.num_tokens for text in texts)) for i, search_term in enumerate(search_terms_total): x_tick_start = len_tokens_total * i + i + 1 @@ -1038,7 +981,6 @@ def run(self): labels.append(list(range(len(files) + 1))) labels.append([self.tr('Total')] + [file['name'] for file in reversed(files)]) labels.append(len(files) + 1) - elif settings['fig_settings']['sort_results_by'] == self.tr('Search term'): len_search_terms_total = len(search_terms_total) @@ -1046,15 +988,15 @@ def run(self): tokens = text.get_tokens_flat() x_tick_start = sum(( - len(text.get_tokens_flat()) + text.num_tokens for j, text in enumerate(texts) if j < i )) + j + 1 # 1/2 - x_ticks.append(x_tick_start + len(tokens) / 2) + x_ticks.append(x_tick_start + text.num_tokens / 2) # Divider - x_ticks.append(x_tick_start + len(tokens) + 1) + x_ticks.append(x_tick_start + text.num_tokens + 1) for file in files: # 1/2 diff --git a/wordless/wl_concordancer_parallel.py b/wordless/wl_concordancer_parallel.py index 3ca6d1f2d..0c3ef00af 100644 --- a/wordless/wl_concordancer_parallel.py +++ b/wordless/wl_concordancer_parallel.py @@ -27,7 +27,7 @@ from wordless.wl_checks import wl_checks_work_area from wordless.wl_dialogs import wl_dialogs_misc, wl_msg_boxes -from wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_token_processing +from wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_texts, wl_token_processing from 
wordless.wl_utils import wl_misc, wl_threading from wordless.wl_widgets import wl_labels, wl_layouts, wl_tables, wl_widgets @@ -317,10 +317,10 @@ def update_gui_table(self, err_msg, concordance_lines): self.set_item_num(i, 0, parallel_unit_no) self.set_item_num(i, 1, parallel_unit_no, len_parallel_units) - for j, (parallel_unit_raw, parallel_unit_search) in enumerate(concordance_line[1]): - label_parallel_unit = wl_labels.Wl_Label_Html(' '.join(parallel_unit_raw), self.main) - label_parallel_unit.text_raw = parallel_unit_raw - label_parallel_unit.text_search = parallel_unit_search + for j, (parallel_unit_tokens_raw, parallel_unit_tokens_search) in enumerate(concordance_line[1]): + label_parallel_unit = wl_labels.Wl_Label_Html(' '.join(parallel_unit_tokens_raw), self.main) + label_parallel_unit.tokens_raw = parallel_unit_tokens_raw + label_parallel_unit.tokens_search = parallel_unit_tokens_search self.setIndexWidget(self.model().index(i, 2 + j), label_parallel_unit) self.indexWidget(self.model().index(i, 2 + j)).setAlignment(Qt.AlignHCenter | Qt.AlignVCenter) @@ -351,9 +351,8 @@ def run(self): # Parallel Unit No. for file in files: - text = copy.deepcopy(file['text']) text = wl_token_processing.wl_process_tokens_concordancer( - self.main, text, + self.main, file['text'], token_settings = settings['token_settings'], preserve_blank_lines = True ) @@ -374,7 +373,6 @@ def run(self): search_terms = wl_matching.match_search_terms_ngrams( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], search_settings = settings['search_settings'] ) @@ -385,7 +383,6 @@ def run(self): ) = wl_matching.match_search_terms_context( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], context_settings = settings['context_settings'] ) @@ -434,35 +431,24 @@ def run(self): node = parallel_unit_nodes[i] if parallel_unit_no <= len_parallel_units: + parallel_unit_tokens_raw = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1])) + parallel_unit_tokens_raw = wl_nlp_utils.escape_tokens(wl_texts.to_display_texts(parallel_unit_tokens_raw)) # Search in Results - if settings['token_settings']['punc_marks']: - parallel_unit_raw = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1])) - else: - offset_para_start = offsets_paras[parallel_unit_no - 1] - - if parallel_unit_no == len_parallel_units: - offset_para_end = None - else: - offset_para_end = offsets_paras[parallel_unit_no] - - parallel_unit_raw = text.tokens_flat_punc_marks_merged[offset_para_start:offset_para_end] - - parallel_unit_raw = wl_nlp_utils.escape_tokens(parallel_unit_raw) - parallel_unit_search = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1])) + parallel_unit_tokens_search = list(wl_misc.flatten_list(text.tokens_multilevel[parallel_unit_no - 1])) # Highlight node if found if node: len_node = len(node) - for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit_search, len_node)): + for j, ngram in enumerate(wl_nlp_utils.ngrams(parallel_unit_tokens_search, len_node)): if ngram == tuple(node): - parallel_unit_raw[j] = f'{parallel_unit_raw[j]}' - parallel_unit_raw[j + len_node - 1] += '' + parallel_unit_tokens_raw[j] = f'{parallel_unit_tokens_raw[j]}' + parallel_unit_tokens_raw[j + len_node - 1] += '' else: - parallel_unit_raw = [] - parallel_unit_search = [] + parallel_unit_tokens_raw = [] + parallel_unit_tokens_search = [] - parallel_unit_nodes[i] = [parallel_unit_raw, parallel_unit_search] + 
parallel_unit_nodes[i] = [parallel_unit_tokens_raw, parallel_unit_tokens_search] # Remove empty concordance lines for parallel_unit_no, parallel_units_files in parallel_units.copy().items(): diff --git a/wordless/wl_dependency_parser.py b/wordless/wl_dependency_parser.py index 62fa61c28..6dc0d83fe 100644 --- a/wordless/wl_dependency_parser.py +++ b/wordless/wl_dependency_parser.py @@ -29,7 +29,7 @@ from wordless.wl_checks import wl_checks_work_area from wordless.wl_dialogs import wl_dialogs_misc -from wordless.wl_nlp import wl_dependency_parsing, wl_matching, wl_token_processing +from wordless.wl_nlp import wl_dependency_parsing, wl_matching, wl_texts, wl_token_processing from wordless.wl_utils import wl_misc, wl_threading from wordless.wl_widgets import wl_layouts, wl_tables, wl_widgets @@ -383,12 +383,12 @@ def update_gui_table(self, err_msg, results): for i, ( head, dependent, dependency_relation, dependency_len, - sentence_display, sentence_search, + sentence_tokens_raw, sentence_tokens_search, no_sentence, len_sentences, file ) in enumerate(results): # Head self.model().setItem(i, 0, wl_tables.Wl_Table_Item(head)) - # Dependant + # Dependent self.model().setItem(i, 1, wl_tables.Wl_Table_Item(dependent)) # Dependency Relation self.model().setItem(i, 2, wl_tables.Wl_Table_Item(dependency_relation)) @@ -396,9 +396,9 @@ def update_gui_table(self, err_msg, results): self.set_item_num(i, 3, dependency_len) self.set_item_num(i, 4, numpy.abs(dependency_len)) # Sentence - self.model().setItem(i, 5, wl_tables.Wl_Table_Item(' '.join(sentence_display))) - self.model().item(i, 5).text_display = sentence_display - self.model().item(i, 5).text_search = sentence_search + self.model().setItem(i, 5, wl_tables.Wl_Table_Item(' '.join(sentence_tokens_raw))) + self.model().item(i, 5).tokens_raw = sentence_tokens_raw + self.model().item(i, 5).tokens_search = sentence_tokens_search # Sentence No. 
self.set_item_num(i, 6, no_sentence) self.set_item_num(i, 7, no_sentence, len_sentences) @@ -424,7 +424,7 @@ def generate_fig(self): fig_settings = self.main.settings_custom['dependency_parser']['fig_settings'] for row in self.get_selected_rows(): - sentence = tuple(self.model().item(row, 5).text_display) + sentence = tuple(self.model().item(row, 5).tokens_search) if sentence not in sentences_rendered: for file in self.settings['file_area']['files_open']: @@ -435,11 +435,10 @@ def generate_fig(self): self.main, inputs = sentence, lang = file_selected['lang'], - tagged = file_selected['tagged'], show_pos_tags = fig_settings['show_pos_tags'], show_fine_grained_pos_tags = fig_settings['show_fine_grained_pos_tags'], show_lemmas = fig_settings['show_pos_tags'] and fig_settings['show_lemmas'], - # Let "Token Settings - Punctuation marks" to decide whether to collapse punctuation marks + # Handled by Token Settings - Punctuation marks collapse_punc_marks = False, compact_mode = fig_settings['compact_mode'], show_in_separate_tab = fig_settings['show_in_separate_tab'], @@ -468,9 +467,8 @@ def run(self): settings = self.main.settings_custom['dependency_parser'] for file in self.main.wl_file_area.get_selected_files(): - text = copy.deepcopy(file['text']) - text = wl_token_processing.wl_process_tokens_concordancer( - self.main, text, + text = wl_token_processing.wl_process_tokens_dependency_parser( + self.main, file['text'], token_settings = settings['token_settings'] ) @@ -480,7 +478,6 @@ def run(self): search_terms = wl_matching.match_search_terms_tokens( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], search_settings = settings['search_settings'] ) @@ -491,7 +488,6 @@ def run(self): ) = wl_matching.match_search_terms_context( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], context_settings = settings['context_settings'] ) @@ -504,12 +500,10 @@ def run(self): sentence = list(wl_misc.flatten_list(sentence)) if any((token in search_terms for token in sentence)): - dependencies = wl_dependency_parsing.wl_dependency_parse( - self.main, - inputs = sentence, - lang = text.lang, - tagged = text.tagged - ) + dependencies = [ + (token, token.head, token.dependency_relation, token.dependency_len) + for token in sentence + ] for i, (token, head, dependency_relation, dependency_len) in enumerate(dependencies): j = i_token + i @@ -529,25 +523,20 @@ def run(self): no_sentence = bisect.bisect(offsets_sentences, j) # Sentence - if no_sentence == len_sentences: - offset_end = None - else: - offset_end = offsets_sentences[no_sentence] - - sentence_display = text.tokens_flat_punc_marks_merged[offsets_sentences[no_sentence - 1]:offset_end] + sentence_tokens_raw = wl_texts.to_display_texts(sentence) # Remove empty tokens for searching in results - sentence_search = [token for token in sentence if token] + sentence_tokens_search = [token for token in sentence if token] # Head - results[-1].append(head) - # Dependant - results[-1].append(token) + results[-1].append(head.display_text()) + # Dependent + results[-1].append(token.display_text()) # Dependency Relation results[-1].append(dependency_relation) # Dependency Distance results[-1].append(dependency_len) # Sentence - results[-1].extend([sentence_display, sentence_search]) + results[-1].extend([sentence_tokens_raw, sentence_tokens_search]) # Sentence No. 
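# A minimal sketch of the values written into the Head / Dependent / Dependency Relation /
# Dependency Distance columns above: with spaCy, the signed distance is head index minus
# token index (token.head.i - token.i), exactly as the parsing code later in this patch
# computes it. The en_core_web_sm model name below is an assumption for illustration only.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('She quickly read the short report.')

for token in doc:
    # Positive distance: the head follows the dependent; negative: it precedes it
    print(token.head.text, token.text, token.dep_, token.head.i - token.i)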
results[-1].extend([no_sentence, len_sentences]) # File diff --git a/wordless/wl_figs/wl_figs_freqs.py b/wordless/wl_figs/wl_figs_freqs.py index 5541eb4e1..e1e6103ff 100644 --- a/wordless/wl_figs/wl_figs_freqs.py +++ b/wordless/wl_figs/wl_figs_freqs.py @@ -19,11 +19,39 @@ from PyQt5.QtCore import QCoreApplication from wordless.wl_figs import wl_figs +from wordless.wl_nlp import wl_texts from wordless.wl_utils import wl_sorting _tr = QCoreApplication.translate def wl_fig_freqs(main, freq_files_items, tab): + fig_settings = main.settings_custom[tab]['fig_settings'] + + # Tokens / Keywords + if freq_files_items and isinstance(list(freq_files_items.keys())[0], str): + freq_files_items = { + item.display_text(): freq_files + for item, freq_files in freq_files_items.items() + } + # N-grams + elif freq_files_items and isinstance(list(freq_files_items.keys())[0][0], str): + freq_files_items = { + ' '.join(wl_texts.to_display_texts(item)): freq_files + for item, freq_files in freq_files_items.items() + } + # Collocations / Colligations + else: + if fig_settings['graph_type'] == _tr('wl_figs_freqs', 'Network graph'): + freq_files_items = { + (' '.join(wl_texts.to_display_texts(node)), collocate.display_text()): freq_files + for (node, collocate), freq_files in freq_files_items.items() + } + else: + freq_files_items = { + ' '.join(wl_texts.to_display_texts(node)) + ', ' + collocate.display_text(): freq_files + for (node, collocate), freq_files in freq_files_items.items() + } + if tab == 'keyword_extractor': file_names_selected = [ _tr('wl_figs_freqs', 'Reference files'), @@ -36,7 +64,6 @@ def wl_fig_freqs(main, freq_files_items, tab): _tr('wl_figs_freqs', 'Total') ] - fig_settings = main.settings_custom[tab]['fig_settings'] col_sort_by_file = file_names_selected.index(fig_settings['sort_by_file']) if tab == 'keyword_extractor': diff --git a/wordless/wl_figs/wl_figs_stats.py b/wordless/wl_figs/wl_figs_stats.py index 3a818cc89..8c528aa04 100644 --- a/wordless/wl_figs/wl_figs_stats.py +++ b/wordless/wl_figs/wl_figs_stats.py @@ -19,13 +19,40 @@ from PyQt5.QtCore import QCoreApplication from wordless.wl_figs import wl_figs +from wordless.wl_nlp import wl_texts from wordless.wl_utils import wl_sorting _tr = QCoreApplication.translate def wl_fig_stats(main, stat_files_items, tab): - file_names_selected = [*main.wl_file_area.get_selected_file_names(), _tr('wl_figs_stats', 'Total')] fig_settings = main.settings_custom[tab]['fig_settings'] + + # Tokens / Keywords + if stat_files_items and isinstance(list(stat_files_items.keys())[0], str): + stat_files_items = { + item.display_text(): stat_files + for item, stat_files in stat_files_items.items() + } + # N-grams + elif stat_files_items and isinstance(list(stat_files_items.keys())[0][0], str): + stat_files_items = { + ' '.join(wl_texts.to_display_texts(item)): stat_files + for item, stat_files in stat_files_items.items() + } + # Collocations / Colligations + else: + if fig_settings['graph_type'] == _tr('wl_figs_freqs', 'Network graph'): + stat_files_items = { + (' '.join(wl_texts.to_display_texts(node)), collocate.display_text()): stat_files + for (node, collocate), stat_files in stat_files_items.items() + } + else: + stat_files_items = { + ' '.join(wl_texts.to_display_texts(node)) + ', ' + collocate.display_text(): stat_files + for (node, collocate), stat_files in stat_files_items.items() + } + + file_names_selected = [*main.wl_file_area.get_selected_file_names(), _tr('wl_figs_stats', 'Total')] col_sort_by_file = 
file_names_selected.index(fig_settings['sort_by_file']) if fig_settings['use_data'] == _tr('wl_figs_stats', 'p-value'): diff --git a/wordless/wl_file_area.py b/wordless/wl_file_area.py index be459198f..efe686a24 100644 --- a/wordless/wl_file_area.py +++ b/wordless/wl_file_area.py @@ -190,12 +190,12 @@ def __init__(self, parent): drag_drop = True ) - self.setHorizontalHeader(Wl_Table_Header_Files(Qt.Horizontal, self)) - self.file_area = parent self.file_type = self.file_area.file_type self.settings_suffix = self.file_area.settings_suffix + self.setHorizontalHeader(Wl_Table_Header_Files(Qt.Horizontal, self)) + self.setItemDelegateForColumn(1, wl_item_delegates.Wl_Item_Delegate_Uneditable(self)) self.setItemDelegateForColumn(2, wl_item_delegates.Wl_Item_Delegate_Uneditable(self)) self.setItemDelegateForColumn(3, wl_item_delegates.Wl_Item_Delegate_Uneditable(self)) @@ -513,11 +513,11 @@ def __init__(self, main): self.load_settings() def accept(self): - num_files = self.main.settings_custom['file_area']['files_open'] + self.main.settings_custom['file_area']['files_open_ref'] + num_files = len(self.main.settings_custom['file_area']['files_open'] + self.main.settings_custom['file_area']['files_open_ref']) self.main.tabs_file_area.currentWidget().table_files._open_files(files_to_open = self.table_files.files_to_open) - if num_files < self.main.settings_custom['file_area']['files_open'] + self.main.settings_custom['file_area']['files_open_ref']: + if num_files < len(self.main.settings_custom['file_area']['files_open'] + self.main.settings_custom['file_area']['files_open_ref']): super().accept() def reject(self): diff --git a/wordless/wl_keyword_extractor.py b/wordless/wl_keyword_extractor.py index a85c3c2b7..fdd6f65c1 100644 --- a/wordless/wl_keyword_extractor.py +++ b/wordless/wl_keyword_extractor.py @@ -546,7 +546,8 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files): self.set_item_num(i, 0, -1) # Keyword - self.model().setItem(i, 1, wl_tables.Wl_Table_Item(keyword)) + self.model().setItem(i, 1, wl_tables.Wl_Table_Item(keyword.display_text())) + self.model().item(i, 1).tokens_filter = [keyword] # Frequency for j, freq in enumerate(freq_files): @@ -690,53 +691,42 @@ def run(self): # Frequency (Reference files) self.keywords_freq_files.append(collections.Counter()) tokens_ref = [] - len_tokens_ref = 0 for file_ref in files_ref: - text = copy.deepcopy(file_ref['text']) text = wl_token_processing.wl_process_tokens( - self.main, text, + self.main, file_ref['text'], token_settings = settings['token_settings'] ) # Remove empty tokens - tokens_flat = text.get_tokens_flat() - tokens = [token for token in tokens_flat if token] + tokens = text.get_tokens_flat() - self.keywords_freq_files[0] += collections.Counter(tokens) + self.keywords_freq_files[0] += collections.Counter([token for token in tokens if token]) - tokens_ref.extend(tokens_flat) - len_tokens_ref += len(tokens_ref) + tokens_ref.extend(tokens) + + len_tokens_ref = len(tokens_ref) # Frequency (Observed files) for file_observed in files_observed: - text = copy.deepcopy(file_observed['text']) text = wl_token_processing.wl_process_tokens( - self.main, text, + self.main, file_observed['text'], token_settings = settings['token_settings'] ) # Remove empty tokens - tokens_flat = text.get_tokens_flat() - tokens = [token for token in tokens_flat if token] + tokens = text.get_tokens_flat() - self.keywords_freq_files.append(collections.Counter(tokens)) + self.keywords_freq_files.append(collections.Counter([token for token in 
tokens if token])) texts.append(text) # Total if len(files_observed) > 1: - text_total = wl_texts.Wl_Text_Blank() - text_total.tokens_multilevel = [ - copy.deepcopy(para) - for text in texts - for para in text.tokens_multilevel - ] + texts.append(wl_texts.Wl_Text_Total(texts)) self.keywords_freq_files.append(sum(self.keywords_freq_files[1:], collections.Counter())) - texts.append(text_total) - # Remove tokens that do not appear in any of the observed files self.keywords_freq_files[0] = { token: freq @@ -766,7 +756,6 @@ def run(self): keywords_freq_file_observed = self.keywords_freq_files[i + 1] tokens_observed = text.get_tokens_flat() - len_tokens_observed = len(tokens_observed) if to_sections_statistical_significance: freqs_sections_tokens_statistical_significance = wl_measure_utils.to_freqs_sections_statistical_significance( @@ -791,6 +780,8 @@ def run(self): o21s = numpy.empty(shape = num_keywords_all, dtype = float) o22s = numpy.empty(shape = num_keywords_all, dtype = float) + len_tokens_observed = text.num_tokens + for i, token in enumerate(keywords_all): o11s[i] = keywords_freq_file_observed.get(token, 0) o12s[i] = keywords_freq_file_ref.get(token, 0) diff --git a/wordless/wl_measures/wl_measures_lexical_diversity.py b/wordless/wl_measures/wl_measures_lexical_diversity.py index 4602b73bf..0a53437ea 100644 --- a/wordless/wl_measures/wl_measures_lexical_diversity.py +++ b/wordless/wl_measures/wl_measures_lexical_diversity.py @@ -94,8 +94,7 @@ def hdd(main, tokens): ttrs = numpy.empty(len(list(tokens_freqs))) # Short texts - if num_tokens < sample_size: - sample_size = num_tokens + sample_size = min(sample_size, num_tokens) for i, freq in enumerate(tokens_freqs.values()): ttrs[i] = scipy.stats.hypergeom.pmf(k = 0, M = num_tokens, n = freq, N = sample_size) diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py index 93a1ac6d9..c0d026cd8 100644 --- a/wordless/wl_measures/wl_measures_readability.py +++ b/wordless/wl_measures/wl_measures_readability.py @@ -25,7 +25,7 @@ from PyQt5.QtCore import QCoreApplication from wordless.wl_checks import wl_checks_tokens -from wordless.wl_nlp import wl_lemmatization, wl_pos_tagging, wl_sentence_tokenization, wl_syl_tokenization +from wordless.wl_nlp import wl_lemmatization, wl_pos_tagging, wl_sentence_tokenization, wl_syl_tokenization, wl_texts from wordless.wl_utils import wl_misc, wl_paths _tr = QCoreApplication.translate @@ -42,11 +42,11 @@ def get_nums(main, text): text.words_multilevel[-1].append([]) for sentence_seg in sentence: - text.words_multilevel[-1][-1].append([ + text.words_multilevel[-1][-1].append(wl_texts.to_tokens([ token for token in sentence_seg if wl_checks_tokens.is_word_alphanumeric(token) - ]) + ], lang = text.lang)) text.sentences = [ list(wl_misc.flatten_list(sentence)) @@ -63,7 +63,8 @@ def get_nums(main, text): # Number of syllables if 'num_syls' not in text.__dict__ and text.lang in main.settings_global['syl_tokenizers']: - text.syls_words = wl_syl_tokenization.wl_syl_tokenize(main, text.words_flat, lang = text.lang) + text.words_flat = wl_syl_tokenization.wl_syl_tokenize(main, text.words_flat, lang = text.lang) + text.syls_words = wl_texts.get_token_properties(text.words_flat, 'syls') text.num_syls = sum((len(syls) for syls in text.syls_words)) # Number of characters @@ -113,17 +114,17 @@ def get_num_words_syls(syls_words, len_min = 1, len_max = None): )) def get_num_words_pos_tags(main, words, lang, pos_tag): - words_tagged = wl_pos_tagging.wl_pos_tag(main, words, 
lang = lang, tagset = 'universal') + words = wl_pos_tagging.wl_pos_tag(main, words, lang = lang, tagset = 'universal', force = True) - return sum((1 for _, pos in words_tagged if pos_tag in pos)) + return sum((1 for word in words if pos_tag in word.tag)) def get_nums_words_pos_tags(main, words, lang, pos_tags): nums = [] - words_tagged = wl_pos_tagging.wl_pos_tag(main, words, lang = lang, tagset = 'universal') + words = wl_pos_tagging.wl_pos_tag(main, words, lang = lang, tagset = 'universal', force = True) for pos_tag in pos_tags: - nums.append(sum((1 for _, pos in words_tagged if pos_tag in pos))) + nums.append(sum((1 for word in words if pos_tag in word.tag))) return nums @@ -788,9 +789,16 @@ def fog_index(main, text): _tr('wl_measures_readability', 'Original'), 'Powers-Sumner-Kearl' ]: - words_tagged = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = text.lang, tagset = 'universal') + words_tagged = wl_pos_tagging.wl_pos_tag( + main, text.words_flat, + lang = text.lang, + tagset = 'universal', + force = True + ) + + for syls, word in zip(text.syls_words, words_tagged): + tag = word.tag - for syls, (word, tag) in zip(text.syls_words, words_tagged): if ( 'PROPN' not in tag and ( @@ -881,7 +889,10 @@ def lensear_write(main, text): sample = text.words_flat[sample_start : sample_start + 100] num_words_1_syl = 0 - sysl_sample = wl_syl_tokenization.wl_syl_tokenize(main, sample, lang = text.lang) + sysl_sample = wl_texts.get_token_properties( + wl_syl_tokenization.wl_syl_tokenize(main, sample, lang = text.lang), + 'syls' + ) for syls in sysl_sample: if len(syls) == 1 and syls[0].lower() not in ['the', 'is', 'are', 'was', 'were']: @@ -1163,7 +1174,10 @@ def smog_grade(main, text): num_words_3_plus_syls = 0 for sentence in sample: - syls_words = wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang) + syls_words = wl_texts.get_token_properties( + wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang), + 'syls' + ) num_words_3_plus_syls += get_num_words_syls(syls_words, len_min = 3) @@ -1234,7 +1248,10 @@ def strain_index(main, text): num_syls = 0 for sentence in text.sentences[:3]: - syls_words = wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang) + syls_words = wl_texts.get_token_properties( + wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang), + 'syls' + ) num_syls += sum((len(syls) for syls in syls_words)) @@ -1313,71 +1330,23 @@ def td(main, text): # Wheeler & Smith's Readability Formula # Reference: Wheeler, L. R., & Smith, E. H. (1954). A practical readability formula for the classroom teacher in the primary grades. Elementary English, 31(7), 397–399. 
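# The Wheeler-Smith formula segments text into "units", which is why the terminator set
# below folds colons, semicolons, and dashes into the sentence terminators, and why the
# hyphen has to be escaped: the string is meant for a regex character class. A hedged
# sketch of that usage with a tiny stand-in terminator string (the actual splitting helper
# in Wordless is not shown in this hunk):
import re

UNIT_TERMINATORS_DEMO = '.!?:;\\-'  # stand-in for the full Unicode list built below

def count_units(text):
    # Runs of terminators (e.g. "?!" or "...") close a single unit
    units = re.split(f'[{UNIT_TERMINATORS_DEMO}]+', text)

    return len([unit for unit in units if unit.strip()])

print(count_units('First point: setup; second point - testing. Done!'))  # 5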
-UNIT_TERMINATORS = ''.join([ - # Sentence terminators plus colons and semicolons - '\u0021', '\u002E', '\u003A', '\u003B', '\u003F', - '\u037E', - '\u0589', - '\u061B', '\u061D', '\u061E', '\u061F', '\u06D4', - '\u0700', '\u0701', '\u0702', '\u0703', '\u0704', '\u0705', '\u0706', '\u0707', '\u0708', '\u0709', - '\u07F9', - '\u0837', '\u0839', '\u083D', '\u083E', - '\u0964', '\u0965', - '\u104A', '\u104B', - '\u1362', '\u1364', '\u1365', '\u1366', '\u1367', '\u1368', - '\u166E', - '\u1735', '\u1736', - '\u17D4', '\u17D5', - '\u1803', '\u1804', '\u1809', - '\u1944', '\u1945', - '\u1AA8', '\u1AA9', '\u1AAA', '\u1AAB', - '\u1B5A', '\u1B5B', '\u1B5E', '\u1B5F', '\u1B7D', '\u1B7E', - '\u1C3B', '\u1C3C', - '\u1C7E', '\u1C7F', - '\u203C', '\u2047', '\u2048', '\u2049', '\u203D', - '\u2E2E', '\u2E53', '\u2E54', '\u2E3C', - '\u3002', - '\uA4FF', - '\uA60E', '\uA60F', - '\uA6F3', '\uA6F4', '\uA6F6', '\uA6F7', - '\uA876', '\uA877', - '\uA8CE', '\uA8CF', - '\uA92F', - '\uA9C8', '\uA9C9', - '\uAA5D', '\uAA5E', '\uAA5F', - '\uAAF0', '\uAAF1', '\uABEB', - '\uFE52', '\uFE54', '\uFE55', '\uFE56', '\uFE57', - '\uFF01', '\uFF0E', '\uFF1A', '\uFF1B', '\uFF1F', '\uFF61', - '\U00010857', - '\U00010A56', '\U00010A57', - '\U00010B99', '\U00010B9A', - '\U00010F55', '\U00010F56', '\U00010F57', '\U00010F58', '\U00010F59', - '\U00010F86', '\U00010F87', '\U00010F88', '\U00010F89', - '\U00011047', '\U00011048', - '\U000110BE', '\U000110BF', '\U000110C0', '\U000110C1', - '\U00011141', '\U00011142', '\U00011143', - '\U000111C5', '\U000111C6', '\U000111CD', '\U000111DE', '\U000111DF', - '\U00011238', '\U00011239', '\U0001123B', '\U0001123C', - '\U000112A9', - '\U0001144B', '\U0001144C', - '\U000115C2', '\U000115C3', '\U000115C9', '\U000115CA', '\U000115CB', '\U000115CC', '\U000115CD', '\U000115CE', '\U000115CF', '\U000115D0', '\U000115D1', '\U000115D2', '\U000115D3', '\U000115D4', '\U000115D5', '\U000115D6', '\U000115D7', - '\U00011641', '\U00011642', - '\U0001173C', '\U0001173D', '\U0001173E', - '\U00011944', '\U00011946', - '\U00011A42', '\U00011A43', - '\U00011A9B', '\U00011A9C', - '\U00011C41', '\U00011C42', - '\U00011EF7', '\U00011EF8', - '\U00011F43', '\U00011F44', +UNIT_TERMINATORS = ''.join(list(wl_sentence_tokenization.SENTENCE_TERMINATORS) + list(dict.fromkeys([ + # Colons and semicolons: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:name=/COLON/:]%26[:General_Category=/Punctuation/:] + '\u003A', '\u003B', + '\u061B', + '\u0703', '\u0704', '\u0705', '\u0706', '\u0707', '\u0708', '\u0709', + '\u1364', '\u1365', '\u1366', + '\u1804', + '\u204F', '\u205D', + '\u2E35', + '\uA6F4', '\uA6F6', + '\uFE13', '\uFE14', + '\uFE54', '\uFE55', + '\uFF1A', '\uFF1B', '\U00012471', '\U00012472', '\U00012473', '\U00012474', - '\U00016A6E', '\U00016A6F', - '\U00016AF5', - '\U00016B37', '\U00016B38', '\U00016B44', - '\U00016E98', - '\U0001BC9F', - '\U0001DA88', '\U0001DA89', '\U0001DA8A', - - # Dashes: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Dash%CE%B2=Yes:] + '\U0001DA89', '\U0001DA8A', + + # Dashes: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Dash=Yes:] '\\\u002D', # The hyphen character needs to be escaped in RegEx square brackets '\u058A', '\u05BE', @@ -1393,7 +1362,7 @@ def td(main, text): '\uFE58', '\uFE63', '\uFF0D', '\U00010EAD' -]) +]))) def wheeler_smiths_readability_formula(main, text): if text.lang in main.settings_global['syl_tokenizers']: diff --git a/wordless/wl_ngram_generator.py b/wordless/wl_ngram_generator.py index f71ad1139..6e4564bd0 100644 --- 
a/wordless/wl_ngram_generator.py +++ b/wordless/wl_ngram_generator.py @@ -658,9 +658,9 @@ def update_gui_table(self, err_msg, ngrams_freq_files, ngrams_stats_files): self.set_item_num(i, 0, -1) # N-gram - self.model().setItem(i, 1, wl_tables.Wl_Table_Item(' '.join(ngram))) - - self.model().item(i, 1).text_raw = ngram + self.model().setItem(i, 1, wl_tables.Wl_Table_Item(' '.join(wl_texts.to_display_texts(ngram)))) + self.model().item(i, 1).tokens_search = ngram + self.model().item(i, 1).tokens_filter = ngram # Frequency for j, freq in enumerate(freq_files): @@ -719,11 +719,6 @@ def update_gui_fig(self, err_msg, ngrams_freq_files, ngrams_stats_files): settings = self.main.settings_custom['ngram_generator'] if settings['fig_settings']['use_data'] == self.tr('Frequency'): - ngrams_freq_files = { - ' '.join(ngram): freqs - for ngram, freqs in ngrams_freq_files.items() - } - wl_figs_freqs.wl_fig_freqs( self.main, ngrams_freq_files, tab = 'ngram_generator' @@ -735,11 +730,6 @@ def update_gui_fig(self, err_msg, ngrams_freq_files, ngrams_stats_files): col_text_dispersion = self.main.settings_global['measures_dispersion'][measure_dispersion]['col_text'] col_text_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][measure_adjusted_freq]['col_text'] - ngrams_stats_files = { - ' '.join(ngram): stats - for ngram, stats in ngrams_stats_files.items() - } - if settings['fig_settings']['use_data'] == col_text_dispersion: ngrams_stat_files = { ngram: numpy.array(stats_files)[:, 0] @@ -791,9 +781,8 @@ def run(self): for file in files: ngrams_is = [] - text = copy.deepcopy(file['text']) text = wl_token_processing.wl_process_tokens( - self.main, text, + self.main, file['text'], token_settings = settings['token_settings'] ) tokens = text.get_tokens_flat() @@ -821,7 +810,6 @@ def run(self): search_terms = wl_matching.match_search_terms_ngrams( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], search_settings = settings['search_settings'] ) @@ -832,7 +820,6 @@ def run(self): ) = wl_matching.match_search_terms_context( self.main, tokens, lang = text.lang, - tagged = text.tagged, token_settings = settings['token_settings'], context_settings = settings['context_settings'] ) @@ -876,20 +863,13 @@ def run(self): # Total if len(files) > 1: - text_total = wl_texts.Wl_Text_Blank() - text_total.tokens_multilevel = [ - copy.deepcopy(para) - for text in texts - for para in text.tokens_multilevel - ] + texts.append(wl_texts.Wl_Text_Total(texts)) self.ngrams_freq_files.append(sum([ collections.Counter(ngrams_freq_file) for ngrams_freq_file in self.ngrams_freq_files ], collections.Counter())) - texts.append(text_total) - # Dispersion & Adjusted Frequency measure_dispersion = settings['generation_settings']['measure_dispersion'] measure_adjusted_freq = settings['generation_settings']['measure_adjusted_freq'] diff --git a/wordless/wl_nlp/wl_dependency_parsing.py b/wordless/wl_nlp/wl_dependency_parsing.py index 3e877232e..b85d915ce 100644 --- a/wordless/wl_nlp/wl_dependency_parsing.py +++ b/wordless/wl_nlp/wl_dependency_parsing.py @@ -26,7 +26,7 @@ from wordless.wl_checks import wl_checks_misc from wordless.wl_dialogs import wl_msg_boxes -from wordless.wl_nlp import wl_matching, wl_nlp_utils +from wordless.wl_nlp import wl_nlp_utils, wl_texts from wordless.wl_settings import wl_settings_default from wordless.wl_utils import wl_conversion, wl_misc, wl_paths @@ -34,25 +34,53 @@ is_windows, is_macos, is_linux = wl_misc.check_os() -def wl_dependency_parse(main, 
inputs, lang, dependency_parser = 'default', tagged = False): - if dependency_parser == 'default': - dependency_parser = main.settings_custom['dependency_parsing']['dependency_parser_settings'][lang] +def wl_dependency_parse(main, inputs, lang, dependency_parser = 'default', force = False): + if ( + not isinstance(inputs, str) + and inputs + and list(inputs)[0].head is not None + and not force + ): + return inputs + else: + if dependency_parser == 'default': + dependency_parser = main.settings_custom['dependency_parsing']['dependency_parser_settings'][lang] + + wl_nlp_utils.init_dependency_parsers( + main, + lang = lang, + dependency_parser = dependency_parser, + tokenized = not isinstance(inputs, str) + ) - wl_nlp_utils.init_dependency_parsers( - main, - lang = lang, - dependency_parser = dependency_parser, - tokenized = not isinstance(inputs, str) - ) + if isinstance(inputs, str): + texts, dependencies = wl_dependency_parse_text(main, inputs, lang, dependency_parser) + tokens = wl_texts.to_tokens(texts, lang = lang) - if isinstance(inputs, str): - dependencies = wl_dependency_parse_text(main, inputs, lang, dependency_parser) - else: - dependencies = wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged) + for token, (_, head_i, dependency_relation, dependency_len) in zip(tokens, dependencies): + token.head = tokens[head_i] + token.dependency_relation = dependency_relation + token.dependency_len = dependency_len - return dependencies + return tokens + else: + texts, token_properties = wl_texts.split_texts_properties(inputs) + + dependencies = wl_dependency_parse_tokens(main, texts, lang, dependency_parser) + + tokens = wl_texts.combine_texts_properties(texts, token_properties) + + for token, (_, head_i, dependency_relation, dependency_len) in zip(tokens, dependencies): + token.head = inputs[head_i] + token.dependency_relation = dependency_relation + token.dependency_len = dependency_len + + wl_texts.update_token_properties(inputs, tokens) + + return inputs def wl_dependency_parse_text(main, inputs, lang, dependency_parser): + tokens = [] dependencies = [] # spaCy @@ -65,14 +93,19 @@ def wl_dependency_parse_text(main, inputs, lang, dependency_parser): for pipeline in ['tagger', 'morphologizer', 'lemmatizer', 'attribute_ruler', 'senter', 'sentencizer'] if nlp.has_pipe(pipeline) ]): + i_head_start = 0 + for doc in nlp.pipe(inputs.splitlines()): for token in doc: + tokens.append(token.text) dependencies.append(( - token.text, token.head.text, + i_head_start + token.head.i, token.dep_, token.head.i - token.i )) + + i_head_start += len(doc) # Stanza elif dependency_parser.startswith('stanza_'): if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: @@ -80,27 +113,25 @@ def wl_dependency_parse_text(main, inputs, lang, dependency_parser): nlp = main.__dict__[f'stanza_nlp_{lang}'] lines = [line.strip() for line in inputs.splitlines() if line.strip()] + i_head_start = 0 for doc in nlp.bulk_process(lines): for sentence in doc.sentences: - for token in sentence.words: + for i, token in enumerate(sentence.words): + tokens.append(token.text) dependencies.append(( - token.text, sentence.words[token.head - 1].text if token.head > 0 else token.text, + i_head_start + token.head - 1 if token.head > 0 else i_head_start + i, token.deprel, token.head - token.id if token.head > 0 else 0 )) - return dependencies + i_head_start += len(sentence.words) -def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged): - dependencies = [] + return tokens, dependencies - # Discard empty 
tokens since they are useless for dependency parsing and spacy.tokens.Doc does not accept empty strings - inputs = [token for token in inputs if token] - - if tagged: - inputs, tags = wl_matching.split_tokens_tags(main, inputs) +def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser): + dependencies = [] # spaCy if dependency_parser.startswith('spacy_'): @@ -112,17 +143,21 @@ def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged): for pipeline in ['senter', 'sentencizer'] if nlp.has_pipe(pipeline) ]): + i_head_start = 0 + for doc in nlp.pipe([ spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [True] * len(tokens)) for tokens in wl_nlp_utils.split_token_list(main, inputs, dependency_parser) ]): for token in doc: dependencies.append(( - token.text, token.head.text, + i_head_start + token.head.i, token.dep_, token.head.i - token.i )) + + i_head_start += len(doc) # Stanza elif dependency_parser.startswith('stanza_'): if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: @@ -131,36 +166,28 @@ def wl_dependency_parse_tokens(main, inputs, lang, dependency_parser, tagged): lang_stanza = lang nlp = main.__dict__[f'stanza_nlp_{lang_stanza}'] + i_head_start = 0 for doc in nlp.bulk_process([ [tokens] for tokens in wl_nlp_utils.split_token_list(main, inputs, dependency_parser) ]): for sentence in doc.sentences: - for token in sentence.words: + for i, token in enumerate(sentence.words): dependencies.append(( - token.text, sentence.words[token.head - 1].text if token.head > 0 else token.text, + i_head_start + token.head - 1 if token.head > 0 else i_head_start + i, token.deprel, token.head - token.id if token.head > 0 else 0 )) - if tagged: - for i, dependency in enumerate(dependencies): - token, head, dependency_relation, dependency_dist = dependency - - dependencies[i] = ( - token + tags[i], - head + tags[i + dependency_dist], - dependency_relation, - dependency_dist - ) + i_head_start += len(sentence.words) return dependencies def wl_dependency_parse_fig( main, inputs, - lang, dependency_parser = 'default', tagged = False, + lang, dependency_parser = 'default', show_pos_tags = True, show_fine_grained_pos_tags = False, show_lemmas = False, collapse_punc_marks = True, compact_mode = False, show_in_separate_tab = False @@ -186,7 +213,7 @@ def wl_dependency_parse_fig( else: htmls = wl_dependency_parse_fig_tokens( main, inputs, - lang, dependency_parser, tagged, + lang, dependency_parser, show_pos_tags, show_fine_grained_pos_tags, show_lemmas, collapse_punc_marks, compact_mode, show_in_separate_tab @@ -206,7 +233,7 @@ def _get_pipelines_disabled(show_pos_tags, show_lemmas): return pipelines_disabled -def to_displacy_sentence(lang, sentence, token_tags = None): +def to_displacy_sentence(lang, sentence, token_properties = None): words = [] tags = [] pos = [] @@ -224,14 +251,18 @@ def to_displacy_sentence(lang, sentence, token_tags = None): ]: len_sentence = len(sentence.words) - if token_tags is not None: - token_tags = reversed(token_tags) + if token_properties is not None: + token_properties = reversed(token_properties) for i, word in enumerate(reversed(sentence.words)): - if token_tags is None: + if token_properties is None: words.append(word.text) else: - words.append(word.text + token_tags[i]) + words.append( + word.text + + (token_properties[i]['punc_mark'] or '') + + (token_properties[i]['tag'] or '') + ) if word.xpos is not None: tags.append(word.xpos) @@ -253,10 +284,14 @@ def to_displacy_sentence(lang, sentence, token_tags = None): heads.append(len_sentence - 
word.head) else: for i, word in enumerate(sentence.words): - if token_tags is None: + if token_properties is None: words.append(word.text) else: - words.append(word.text + token_tags[i]) + words.append( + word.text + + (token_properties[i]['punc_mark'] or '') + + (token_properties[i]['tag'] or '') + ) if word.xpos is not None: tags.append(word.xpos) @@ -362,17 +397,17 @@ def wl_dependency_parse_fig_text( def wl_dependency_parse_fig_tokens( main, inputs, - lang, dependency_parser, tagged, + lang, dependency_parser, show_pos_tags, show_fine_grained_pos_tags, show_lemmas, collapse_punc_marks, compact_mode, show_in_separate_tab ): htmls = [] - if tagged: - inputs, tags = wl_matching.split_tokens_tags(main, inputs) + if inputs and isinstance(list(inputs)[0], wl_texts.Wl_Token): + inputs, token_properties = wl_texts.split_texts_properties(inputs) else: - tags = [''] * len(inputs) + token_properties = [] options = { 'fine_grained': show_fine_grained_pos_tags, @@ -392,25 +427,21 @@ def wl_dependency_parse_fig_tokens( if nlp.has_pipe(pipeline) ]): docs = [] - lens_docs = [] for tokens in wl_nlp_utils.split_token_list(main, inputs, dependency_parser): docs.append(spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [True] * len(tokens))) - # Record length of each section - if tagged: - lens_docs.append(len(tokens)) + if token_properties: + i_tag_start = 0 if show_in_separate_tab: - for i_doc, doc in enumerate(nlp.pipe(docs)): + for doc in nlp.pipe(docs): for sentence in doc.sents: - # Put back tokens and tags displacy_dict = spacy.displacy.parse_deps(sentence.as_doc(), options = options) - for token, word in zip(sentence, displacy_dict['words']): - i_tag = sum(lens_docs[:i_doc]) + token.i - - word['text'] += tags[i_tag] + if token_properties: + for token, word in zip(sentence, displacy_dict['words']): + word['text'] += token_properties[i_tag_start + token.i] htmls.append(spacy.displacy.render( displacy_dict, @@ -419,21 +450,26 @@ def wl_dependency_parse_fig_tokens( options = options, manual = True )) + + if token_properties: + i_tag_start += len(doc) else: sentences = [] - for i_doc, doc in enumerate(nlp.pipe(docs)): + for doc in nlp.pipe(docs): for sentence in doc.sents: - # Put back tokens and tags displacy_dict = spacy.displacy.parse_deps(sentence.as_doc(), options = options) - for token, word in zip(sentence, displacy_dict['words']): - i_tag = sum(lens_docs[:i_doc]) + token.i - - word['text'] += tags[i_tag] + if token_properties: + for token, word in zip(sentence, displacy_dict['words']): + properties = token_properties[i_tag_start + token.i] + word['text'] += (properties['punc_mark'] or '') + (properties['tag']) sentences.append(displacy_dict) + if token_properties: + i_tag_start += len(doc) + htmls.append(spacy.displacy.render( sentences, style = 'dep', @@ -456,9 +492,12 @@ def wl_dependency_parse_fig_tokens( for tokens in wl_nlp_utils.split_token_list(main, inputs, dependency_parser) ]): for sentence in doc.sentences: - if tagged: + if token_properties: num_words = len(sentence.words) - sentences.append(to_displacy_sentence(lang, sentence, token_tags = tags[i_tag : i_tag + num_words])) + sentences.append(to_displacy_sentence( + lang, sentence, + token_properties = token_properties[i_tag : i_tag + num_words] + )) i_tag += num_words else: diff --git a/wordless/wl_nlp/wl_lemmatization.py b/wordless/wl_nlp/wl_lemmatization.py index f5500442a..467f0da9d 100644 --- a/wordless/wl_nlp/wl_lemmatization.py +++ b/wordless/wl_nlp/wl_lemmatization.py @@ -21,40 +21,62 @@ import simplemma import spacy -from 
wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_pos_tagging, wl_word_tokenization +from wordless.wl_nlp import wl_nlp_utils, wl_pos_tagging, wl_texts, wl_word_tokenization from wordless.wl_utils import wl_conversion _tr = QCoreApplication.translate -def wl_lemmatize(main, inputs, lang, lemmatizer = 'default', tagged = False): - if inputs and lang in main.settings_global['lemmatizers']: - if lemmatizer == 'default': - lemmatizer = main.settings_custom['lemmatization']['lemmatizer_settings'][lang] - - wl_nlp_utils.init_word_tokenizers( - main, - lang = lang - ) - wl_nlp_utils.init_lemmatizers( - main, - lang = lang, - lemmatizer = lemmatizer, - tokenized = not isinstance(inputs, str) - ) - - if isinstance(inputs, str): - lemmas = wl_lemmatize_text(main, inputs, lang, lemmatizer) - else: - lemmas = wl_lemmatize_tokens(main, inputs, lang, lemmatizer, tagged) +def wl_lemmatize(main, inputs, lang, lemmatizer = 'default', force = False): + if ( + not isinstance(inputs, str) + and inputs + and list(inputs)[0].lemma is not None + and not force + ): + return inputs else: - if isinstance(inputs, str): - lemmas = wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang) + if inputs and lang in main.settings_global['lemmatizers']: + if lemmatizer == 'default': + lemmatizer = main.settings_custom['lemmatization']['lemmatizer_settings'][lang] + + wl_nlp_utils.init_word_tokenizers( + main, + lang = lang + ) + wl_nlp_utils.init_lemmatizers( + main, + lang = lang, + lemmatizer = lemmatizer, + tokenized = not isinstance(inputs, str) + ) + + if isinstance(inputs, str): + texts, lemmas = wl_lemmatize_text(main, inputs, lang, lemmatizer) + + return wl_texts.to_tokens(texts, lang = lang, lemmas = lemmas) + else: + texts, token_properties = wl_texts.split_texts_properties(inputs) + + lemmas = wl_lemmatize_tokens(main, texts, lang, lemmatizer) + tokens = wl_texts.combine_texts_properties(texts, token_properties) + wl_texts.set_token_properties(tokens, 'lemma', lemmas) + + wl_texts.update_token_properties(inputs, tokens) + + return inputs else: - lemmas = inputs.copy() + if isinstance(inputs, str): + tokens = wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang) + wl_texts.set_token_properties(tokens, 'lemma', wl_texts.to_token_texts(tokens)) - return lemmas + return tokens + else: + wl_texts.set_token_properties(inputs, 'lemma', wl_texts.to_token_texts(inputs)) + + return inputs def wl_lemmatize_text(main, inputs, lang, lemmatizer): + tokens = [] lemmas = [] # spaCy @@ -69,6 +91,8 @@ def wl_lemmatize_text(main, inputs, lang, lemmatizer): ]): for doc in nlp.pipe(inputs.splitlines()): for token in doc: + tokens.append(token.text) + if token.lemma_: lemmas.append(token.lemma_) else: @@ -84,6 +108,8 @@ def wl_lemmatize_text(main, inputs, lang, lemmatizer): for doc in nlp.bulk_process(lines): for sentence in doc.sentences: for token in sentence.words: + tokens.append(token.text) + if token.lemma is not None: lemmas.append(token.lemma) else: @@ -92,87 +118,78 @@ def wl_lemmatize_text(main, inputs, lang, lemmatizer): for line in inputs.splitlines(): # simplemma if lemmatizer.startswith('simplemma_'): - tokens = wl_word_tokenization.wl_word_tokenize_flat(main, line, lang = lang) + tokens_line = wl_word_tokenization.wl_word_tokenize_flat(main, line, lang = lang) + tokens_line = wl_texts.to_display_texts(tokens_line) if lang in ['hrv', 'srp_latn']: lang = 'hbs' else: lang = wl_conversion.to_iso_639_1(main, lang, no_suffix = True) - lemmas.extend([simplemma.lemmatize(token, lang = lang) for 
token in tokens]) + tokens.extend((str(token) for token in tokens_line)) + lemmas.extend((simplemma.lemmatize(token, lang = lang) for token in tokens_line)) # English elif lemmatizer == 'nltk_wordnet': word_net_lemmatizer = nltk.WordNetLemmatizer() - for token, pos in wl_pos_tagging.wl_pos_tag( + for token in wl_pos_tagging.wl_pos_tag( main, line, lang = 'eng_us', pos_tagger = 'nltk_perceptron_eng', tagset = 'universal' ): - if pos == 'ADJ': - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ)) - elif pos in ['NOUN', 'PROPN']: - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN)) - elif pos == 'ADV': - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV)) - elif pos in ['VERB', 'AUX']: - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB)) - else: - lemmas.append(word_net_lemmatizer.lemmatize(token)) + tokens.append(str(token)) + + match token.tag[1:]: + case 'ADJ': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.ADJ)) + case 'NOUN' | 'PROPN': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.NOUN)) + case 'ADV': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.ADV)) + case 'VERB' | 'AUX': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.VERB)) + case _: + lemmas.append(word_net_lemmatizer.lemmatize(str(token))) # Japanese elif lemmatizer == 'sudachipy_jpn': - lemmas.extend([ - token.dictionary_form() - for token in main.sudachipy_word_tokenizer.tokenize(line) - ]) + for token in main.sudachipy_word_tokenizer.tokenize(line): + tokens.append(token.surface()) + lemmas.append(token.dictionary_form()) # Russian & Ukrainian elif lemmatizer == 'pymorphy3_morphological_analyzer': - if lang == 'rus': - morphological_analyzer = main.pymorphy3_morphological_analyzer_rus - elif lang == 'ukr': - morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr - - tokens = wl_word_tokenization.wl_word_tokenize_flat(main, line, lang = lang) - - for token in tokens: + match lang: + case 'rus': + morphological_analyzer = main.pymorphy3_morphological_analyzer_rus + case 'ukr': + morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr + + for token in wl_word_tokenization.wl_word_tokenize_flat(main, line, lang = lang): + tokens.append(str(token)) lemmas.append(morphological_analyzer.parse(token)[0].normal_form) # Tibetan elif lemmatizer == 'botok_bod': - tokens = main.botok_word_tokenizer.tokenize(line) + for token in main.botok_word_tokenizer.tokenize(line): + tokens.append(token.text) - for token in tokens: if token.lemma: lemmas.append(token.lemma) else: lemmas.append(token.text) - # Remove empty lemmas and strip whitespace in tokens - lemmas = [ - lemma_clean - for lemma in lemmas - if (lemma_clean := str(lemma).strip()) - ] + # Strip whitespace around lemmas and remove empty lemmas + for i, lemma in reversed(list(enumerate(lemmas))): + lemmas[i] = str(lemma).strip() - return lemmas + if not lemmas[i]: + del tokens[i] + del lemmas[i] + + return tokens, lemmas -def wl_lemmatize_tokens(main, inputs, lang, lemmatizer, tagged): +def wl_lemmatize_tokens(main, inputs, lang, lemmatizer): lemma_tokens = [] lemmas = [] - empty_offsets = [] - - if tagged: - inputs, tags = wl_matching.split_tokens_tags(main, inputs) - else: - tags = [''] * len(inputs) - - # Record positions of empty tokens and tags since spacy.tokens.Doc does not accept empty 
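The WordNet branch now dispatches on universal POS tags with match/case and falls back to the lemmatizer's default (noun) behaviour. A sketch of the same mapping as a lookup table, assuming the NLTK WordNet data has been downloaded (nltk.download('wordnet')); the PROPN and AUX entries mirror the NOUN and VERB cases above:

import nltk

# Universal POS tags mapped to WordNet POS constants; unknown tags fall back
# to NOUN, which is also what lemmatize() uses when no pos is given.
UNIVERSAL_TO_WORDNET = {
    'ADJ': nltk.corpus.wordnet.ADJ,
    'NOUN': nltk.corpus.wordnet.NOUN,
    'PROPN': nltk.corpus.wordnet.NOUN,
    'ADV': nltk.corpus.wordnet.ADV,
    'VERB': nltk.corpus.wordnet.VERB,
    'AUX': nltk.corpus.wordnet.VERB,
}

def lemmatize_tagged(tokens_tagged):
    lemmatizer = nltk.WordNetLemmatizer()

    return [
        lemmatizer.lemmatize(token, pos = UNIVERSAL_TO_WORDNET.get(tag, nltk.corpus.wordnet.NOUN))
        for token, tag in tokens_tagged
    ]

# lemmatize_tagged([('walks', 'NOUN'), ('was', 'VERB')]) returns ['walk', 'be']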
strings - for i, token in reversed(list(enumerate(inputs))): - if not token.strip(): - empty_offsets.append(i) - - del inputs[i] - del tags[i] # spaCy if lemmatizer.startswith('spacy_'): @@ -238,36 +255,37 @@ def wl_lemmatize_tokens(main, inputs, lang, lemmatizer, tagged): elif lemmatizer == 'nltk_wordnet': word_net_lemmatizer = nltk.WordNetLemmatizer() - for token, pos in wl_pos_tagging.wl_pos_tag( - main, tokens, + for token in wl_pos_tagging.wl_pos_tag( + main, wl_texts.to_tokens(tokens, lang = lang), lang = 'eng_us', pos_tagger = 'nltk_perceptron_eng', tagset = 'universal' ): - if pos == 'ADJ': - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ)) - elif pos in ['NOUN', 'PROPN']: - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN)) - elif pos == 'ADV': - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV)) - elif pos in ['VERB', 'AUX']: - lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB)) - else: - lemmas.append(word_net_lemmatizer.lemmatize(token)) + match token.tag[1:]: + case 'ADJ': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.ADJ)) + case 'NOUN' | 'PROPN': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.NOUN)) + case 'ADV': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.ADV)) + case 'VERB' | 'AUX': + lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.VERB)) + case _: + lemmas.append(word_net_lemmatizer.lemmatize(str(token))) lemma_tokens.extend(tokens.copy()) # Japanese elif lemmatizer == 'sudachipy_jpn': - tokens_retokenized = main.sudachipy_word_tokenizer.tokenize(''.join(tokens)) - - lemma_tokens.extend([token.surface() for token in tokens_retokenized]) - lemmas.extend([token.dictionary_form() for token in tokens_retokenized]) + for token in main.sudachipy_word_tokenizer.tokenize(''.join(tokens)): + lemma_tokens.append(token.surface()) + lemmas.append(token.dictionary_form()) # Russian & Ukrainian elif lemmatizer == 'pymorphy3_morphological_analyzer': - if lang == 'rus': - morphological_analyzer = main.pymorphy3_morphological_analyzer_rus - elif lang == 'ukr': - morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr + match lang: + case 'rus': + morphological_analyzer = main.pymorphy3_morphological_analyzer_rus + case 'ukr': + morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr for token in tokens: lemmas.append(morphological_analyzer.parse(token)[0].normal_form) @@ -285,7 +303,7 @@ def wl_lemmatize_tokens(main, inputs, lang, lemmatizer, tagged): lemma_tokens.append(token.text) - # Remove empty lemmas and strip whitespace around lemmas + # Strip whitespace around lemmas and remove empty lemmas for i, (lemma, lemma_token) in reversed(list(enumerate(zip(lemmas, lemma_tokens)))): lemmas[i] = str(lemma).strip() lemma_tokens[i] = str(lemma_token).strip() @@ -296,9 +314,4 @@ def wl_lemmatize_tokens(main, inputs, lang, lemmatizer, tagged): lemmas = wl_nlp_utils.align_tokens(inputs, lemma_tokens, lemmas, prefer_raw = True) - # Insert empty lemmas and their tags after alignment of input and output - for empty_offset in sorted(empty_offsets): - lemmas.insert(empty_offset, '') - tags.insert(empty_offset, '') - - return [lemma + tag for lemma, tag in zip(lemmas, tags)] + return lemmas diff --git a/wordless/wl_nlp/wl_matching.py b/wordless/wl_nlp/wl_matching.py index 09d252552..6290e13db 100644 --- 
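The cleanup step now strips whitespace around lemmas and drops empty ones while keeping the token and lemma lists aligned; iterating in reverse keeps the earlier indices valid while items are deleted. A self-contained sketch of that loop with illustrative data:

tokens = ['British', ' Birds ', '', 'Bewick']
lemmas = ['British', ' bird ', ' ', 'Bewick']

# Strip whitespace around lemmas and remove empty lemmas, deleting the
# corresponding tokens so both lists stay parallel
for i, lemma in reversed(list(enumerate(lemmas))):
    lemmas[i] = str(lemma).strip()

    if not lemmas[i]:
        del tokens[i]
        del lemmas[i]

# tokens == ['British', ' Birds ', 'Bewick'], lemmas == ['British', 'bird', 'Bewick']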
a/wordless/wl_nlp/wl_matching.py +++ b/wordless/wl_nlp/wl_matching.py @@ -22,7 +22,7 @@ from PyQt5.QtCore import QCoreApplication -from wordless.wl_nlp import wl_lemmatization +from wordless.wl_nlp import wl_lemmatization, wl_texts _tr = QCoreApplication.translate @@ -108,14 +108,6 @@ def get_re_tags_with_tokens(main, tag_type): return '|'.join(tags_embedded + tags_non_embedded) -def split_tokens_tags(main, tokens): - re_tags = get_re_tags(main, tag_type = 'body') - - tags = [''.join(re.findall(re_tags, token)) for token in tokens] - tokens = [re.sub(re_tags, '', token) for token in tokens] - - return tokens, tags - # Search Terms def check_search_terms(search_settings, search_enabled): search_terms = set() @@ -153,21 +145,25 @@ def check_search_settings(token_settings, search_settings): def match_tokens( main, search_terms, tokens, - lang, tagged, settings + lang, settings ): + search_terms = wl_texts.display_texts_to_tokens(main, search_terms, lang) search_results = set() - # Process tokens to search - tokens_search = tokens.copy() - re_tags = get_re_tags(main, tag_type = 'body') + # Save lemmas + if tokens: + if settings['match_inflected_forms']: + for i, token in enumerate(wl_lemmatization.wl_lemmatize(main, tokens, lang)): + tokens[i] = token + + # Process tokens + tokens_search = copy.deepcopy(tokens) - if settings['match_without_tags'] and tagged: - tokens_search = [re.sub(re_tags, '', token) for token in tokens] + if settings['match_without_tags']: + wl_texts.set_token_properties(tokens_search, 'tag', '') elif settings['match_tags']: - if tagged: - tokens_search = [''.join(re.findall(re_tags, token)) for token in tokens] - else: - tokens_search = [] + wl_texts.set_token_texts(tokens_search, wl_texts.get_token_properties(tokens_search, 'tag')) + wl_texts.set_token_properties(tokens_search, 'tag', '') # Match tokens if tokens_search: @@ -175,20 +171,23 @@ def match_tokens( re_flags = 0 if settings['match_case'] else re.IGNORECASE if settings['use_regex']: - search_terms_regex = search_terms.copy() + search_terms_regex = [search_term.display_text() for search_term in search_terms] # Prevent special characters from being treated as regex else: - search_terms_regex = [re.escape(search_term) for search_term in search_terms] + search_terms_regex = [re.escape(search_term.display_text()) for search_term in search_terms] for search_term in search_terms_regex: for token, token_search in zip(tokens, tokens_search): - if re_match(search_term, token_search, flags = re_flags): + if re_match(search_term, token_search.display_text(), flags = re_flags): search_results.add(token) # Match inflected forms of search terms and search results if settings['match_inflected_forms']: - lemmas_search = wl_lemmatization.wl_lemmatize(main, tokens_search, lang, tagged = tagged) - lemmas_matched = wl_lemmatization.wl_lemmatize(main, {*search_terms, *search_results}, lang, tagged = tagged) + lemmas_search = wl_texts.get_token_properties(tokens_search, 'lemma') + lemmas_matched = wl_texts.get_token_properties( + wl_lemmatization.wl_lemmatize(main, {*search_terms, *search_results}, lang), + 'lemma' + ) for lemma_matched in set(lemmas_matched): # Always match literal strings @@ -202,7 +201,7 @@ def match_tokens( def match_ngrams( main, search_terms, tokens, - lang, tagged, settings + lang, settings ): search_results = set() @@ -211,49 +210,50 @@ def match_ngrams( for search_term in search_terms for search_term_token in search_term.split() }) + search_term_tokens = wl_texts.display_texts_to_tokens(main, 
search_term_tokens, lang) tokens_matched = {search_term_token: set() for search_term_token in search_term_tokens} - # Process tokens to search - tokens_search = tokens.copy() + # Save lemmas + if tokens: + if settings['match_inflected_forms']: + for i, token in enumerate(wl_lemmatization.wl_lemmatize(main, tokens, lang)): + tokens[i] = token - if (settings['match_without_tags'] or settings['match_tags']) and tagged: - tokens_search_tokens, tokens_search_tags = split_tokens_tags(main, tokens_search) + # Process tokens + tokens_search = copy.deepcopy(tokens) - if settings['match_without_tags'] and tagged: - tokens_search = tokens_search_tokens + if settings['match_without_tags']: + wl_texts.set_token_properties(tokens_search, 'tag', '') elif settings['match_tags']: - if tagged: - tokens_search = tokens_search_tags - else: - tokens_search = [] + wl_texts.set_token_texts(tokens_search, wl_texts.get_token_properties(tokens_search, 'tag')) + wl_texts.set_token_properties(tokens_search, 'tag', '') # Match n-grams if tokens_search: re_match = re.fullmatch if settings['match_whole_words'] else re.search re_flags = 0 if settings['match_case'] else re.IGNORECASE - if settings['use_regex']: - search_term_tokens_regex = search_term_tokens.copy() - # Prevent special characters from being treated as regex - else: - search_term_tokens_regex = [re.escape(token) for token in search_term_tokens] + for search_term_token in search_term_tokens: + if settings['use_regex']: + search_term_token_regex = search_term_token.display_text() + # Prevent special characters from being treated as regex + else: + search_term_token_regex = re.escape(search_term_token.display_text()) - for search_term_token in search_term_tokens_regex: for token, token_search in zip(tokens, tokens_search): - if re_match(search_term_token, token_search, flags = re_flags): - # Unescape escaped special characters - if not settings['use_regex']: - search_term_token = re.sub(r'\\(.)', r'\1', search_term_token) - + if re_match(search_term_token_regex, token_search.display_text(), flags = re_flags): tokens_matched[search_term_token].add(token) if settings['match_inflected_forms']: - lemmas_search = wl_lemmatization.wl_lemmatize(main, tokens_search, lang, tagged = tagged) + lemmas_search = wl_texts.get_token_properties(tokens_search, 'lemma') # Search for inflected forms of tokens in search results first for search_term_token, search_term_tokens_matched in copy.deepcopy(tokens_matched).items(): - lemmas_matched = wl_lemmatization.wl_lemmatize(main, search_term_tokens_matched, lang, tagged = tagged) + lemmas_matched = wl_texts.get_token_properties( + wl_lemmatization.wl_lemmatize(main, search_term_tokens_matched, lang), + 'lemma' + ) for token_matched, lemma_matched in zip(search_term_tokens_matched, lemmas_matched): # Always match literal strings @@ -263,7 +263,10 @@ def match_ngrams( if re_match(lemma_matched, lemma_search, flags = re_flags): tokens_matched[search_term_token].add(token) - lemmas_matched = wl_lemmatization.wl_lemmatize(main, search_term_tokens, lang, tagged = tagged) + lemmas_matched = wl_texts.get_token_properties( + wl_lemmatization.wl_lemmatize(main, search_term_tokens, lang), + 'lemma' + ) # Search for inflected forms of tokens in search terms for token_matched, lemma_matched in zip(search_term_tokens, lemmas_matched): @@ -277,7 +280,7 @@ def match_ngrams( for search_term in search_terms: search_term_tokens_matched = [] - for search_term_token in search_term.split(): + for search_term_token in 
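Token matching keeps the same regex semantics as before: whole-word matching uses re.fullmatch, substring matching uses re.search, case-insensitive matching adds re.IGNORECASE, and search terms are escaped unless the user opted into raw regular expressions. A minimal sketch of that decision logic in isolation (function and parameter names are illustrative):

import re

def match_token(search_term, token_text, match_whole_words, match_case, use_regex):
    re_match = re.fullmatch if match_whole_words else re.search
    re_flags = 0 if match_case else re.IGNORECASE

    # Escape the search term unless it is meant to be a regular expression
    pattern = search_term if use_regex else re.escape(search_term)

    return bool(re_match(pattern, token_text, flags = re_flags))

# match_token('bird.', 'Bird.', True, False, False) -> True (the dot is escaped)
# match_token('bird.', 'Birds', True, False, True)  -> True (the dot is a wildcard)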
wl_texts.display_texts_to_tokens(main, search_term.split(), lang): search_term_tokens_matched.append(tokens_matched[search_term_token]) for item in itertools.product(*search_term_tokens_matched): @@ -287,40 +290,28 @@ def match_ngrams( def match_search_terms_tokens( main, tokens, - lang, tagged, - token_settings, search_settings + lang, token_settings, search_settings ): search_terms = check_search_terms(search_settings, search_enabled = True) - # Assign part-of-speech tags - if token_settings['assign_pos_tags']: - tagged = True - if search_terms: search_terms = match_tokens( main, search_terms, tokens, - lang, tagged, - check_search_settings(token_settings, search_settings) + lang, check_search_settings(token_settings, search_settings) ) return search_terms def match_search_terms_ngrams( main, tokens, - lang, tagged, - token_settings, search_settings + lang, token_settings, search_settings ): search_terms = check_search_terms(search_settings, search_enabled = True) - # Assign part-of-speech tags - if token_settings['assign_pos_tags']: - tagged = True - if search_terms: search_terms = match_ngrams( main, search_terms, tokens, - lang, tagged, - check_search_settings(token_settings, search_settings) + lang, check_search_settings(token_settings, search_settings) ) return search_terms @@ -328,16 +319,11 @@ def match_search_terms_ngrams( # Context def match_search_terms_context( main, tokens, - lang, tagged, - token_settings, context_settings + lang, token_settings, context_settings ): search_terms_incl = set() search_terms_excl = set() - # Assign part-of-speech tags - if token_settings['assign_pos_tags']: - tagged = True - # Inclusion search_terms = check_search_terms( search_settings = context_settings['incl'], @@ -347,8 +333,7 @@ def match_search_terms_context( if search_terms: search_terms_incl = match_ngrams( main, search_terms, tokens, - lang, tagged, - check_search_settings(token_settings, context_settings['incl']) + lang, check_search_settings(token_settings, context_settings['incl']) ) # Exclusion @@ -360,8 +345,7 @@ def match_search_terms_context( if search_terms: search_terms_excl = match_ngrams( main, search_terms, tokens, - lang, tagged, - check_search_settings(token_settings, context_settings['excl']) + lang, check_search_settings(token_settings, context_settings['excl']) ) return search_terms_incl, search_terms_excl diff --git a/wordless/wl_nlp/wl_pos_tagging.py b/wordless/wl_nlp/wl_pos_tagging.py index 2a413f21f..977fea6ad 100644 --- a/wordless/wl_nlp/wl_pos_tagging.py +++ b/wordless/wl_nlp/wl_pos_tagging.py @@ -23,7 +23,7 @@ import spacy import underthesea -from wordless.wl_nlp import wl_nlp_utils, wl_word_tokenization +from wordless.wl_nlp import wl_nlp_utils, wl_texts, wl_word_tokenization from wordless.wl_utils import wl_conversion UNIVERSAL_TAGSETS_SPACY = [ @@ -37,238 +37,273 @@ 'stanza_rus', 'stanza_san', 'stanza_snd', 'stanza_hsb', 'stanza_tel' ] -def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default'): - tokens_tagged = [] +def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', force = False): + if ( + not isinstance(inputs, str) + and inputs + and any( + list(inputs)[0].tag is not None + for token in inputs + ) + and not force + ): + return inputs + else: + texts_tagged = [] + tags = [] - if pos_tagger == 'default': - pos_tagger = main.settings_custom['pos_tagging']['pos_tagger_settings']['pos_taggers'][lang] - - if tagset == 'default' and main.settings_custom['pos_tagging']['pos_tagger_settings']['to_universal_pos_tags']: 
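wl_pos_tag now follows the same caching convention as wl_lemmatize, wl_syl_tokenize and wl_dependency_parse: if the incoming tokens already carry the relevant property, the input is returned untouched unless force is set, so repeated annotation passes become no-ops. A simplified sketch of the pattern with a stand-in Token class (the real functions also accept raw strings and handle language and tagger selection):

class Token:
    def __init__(self, text):
        self.text = text
        self.tag = None

def pos_tag(tokens, force = False):
    # Skip work if the tokens have already been tagged, unless forced
    if tokens and tokens[0].tag is not None and not force:
        return tokens

    for token in tokens:
        token.tag = '_NN'  # placeholder for real tagger output

    return tokens

tokens = [Token('Bewick'), Token('Birds')]
pos_tag(tokens)                # annotates the tokens
pos_tag(tokens)                # returns immediately, annotations are reused
pos_tag(tokens, force = True)  # re-runs the tagger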
- tagset = 'universal' - - wl_nlp_utils.init_word_tokenizers( - main, - lang = lang - ) - wl_nlp_utils.init_pos_taggers( - main, - lang = lang, - pos_tagger = pos_tagger, - tokenized = not isinstance(inputs, str) - ) - - # Untokenized - if isinstance(inputs, str): - # spaCy - if pos_tagger.startswith('spacy_'): - lang_spacy = wl_conversion.remove_lang_code_suffixes(main, lang) - nlp = main.__dict__[f'spacy_nlp_{lang_spacy}'] - lines = [line.strip() for line in inputs.splitlines() if line.strip()] - - with nlp.select_pipes(disable = [ - pipeline - for pipeline in ['parser', 'lemmatizer', 'senter', 'sentencizer'] - if nlp.has_pipe(pipeline) - ]): - for doc in nlp.pipe(lines): - if tagset in ['default', 'raw']: - tokens_tagged.extend([(token.text, token.tag_) for token in doc]) - elif tagset == 'universal': - tokens_tagged.extend([(token.text, token.pos_) for token in doc]) - # Stanza - elif pos_tagger.startswith('stanza_'): - if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: - lang_stanza = wl_conversion.remove_lang_code_suffixes(main, lang) - else: - lang_stanza = lang + if pos_tagger == 'default': + pos_tagger = main.settings_custom['pos_tagging']['pos_tagger_settings']['pos_taggers'][lang] - nlp = main.__dict__[f'stanza_nlp_{lang_stanza}'] - lines = [line.strip() for line in inputs.splitlines() if line.strip()] + if tagset == 'default' and main.settings_custom['pos_tagging']['pos_tagger_settings']['to_universal_pos_tags']: + tagset = 'universal' + + wl_nlp_utils.init_word_tokenizers( + main, + lang = lang + ) + wl_nlp_utils.init_pos_taggers( + main, + lang = lang, + pos_tagger = pos_tagger, + tokenized = not isinstance(inputs, str) + ) - for doc in nlp.bulk_process(lines): - for sentence in doc.sentences: - if tagset in ['default', 'raw']: + if isinstance(inputs, str): + # spaCy + if pos_tagger.startswith('spacy_'): + lang_spacy = wl_conversion.remove_lang_code_suffixes(main, lang) + nlp = main.__dict__[f'spacy_nlp_{lang_spacy}'] + lines = [line.strip() for line in inputs.splitlines() if line.strip()] + + with nlp.select_pipes(disable = [ + pipeline + for pipeline in ['parser', 'lemmatizer', 'senter', 'sentencizer'] + if nlp.has_pipe(pipeline) + ]): + for doc in nlp.pipe(lines): + for token in doc: + texts_tagged.append(token.text) + + if tagset in ['default', 'raw']: + tags.append(token.tag_) + elif tagset == 'universal': + tags.append(token.pos_) + # Stanza + elif pos_tagger.startswith('stanza_'): + if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: + lang_stanza = wl_conversion.remove_lang_code_suffixes(main, lang) + else: + lang_stanza = lang + + nlp = main.__dict__[f'stanza_nlp_{lang_stanza}'] + lines = [line.strip() for line in inputs.splitlines() if line.strip()] + + for doc in nlp.bulk_process(lines): + for sentence in doc.sentences: for token in sentence.words: - if token.xpos is not None: - tokens_tagged.append((token.text, token.xpos)) - else: - tokens_tagged.append((token.text, token.upos)) - elif tagset == 'universal': - tokens_tagged.extend([(token.text, token.upos) for token in sentence.words]) - else: - for line in inputs.splitlines(): - tokens_tagged.extend(wl_pos_tag_text(main, line, lang, pos_tagger)) - # Tokenized - else: - # Record positions of empty tokens since spacy.tokens.Doc does not accept empty strings - empty_offsets = [] - - for i, token in reversed(list(enumerate(inputs))): - if not token.strip(): - empty_offsets.append(i) - - del inputs[i] - - # spaCy - if pos_tagger.startswith('spacy_'): - lang_spacy = wl_conversion.remove_lang_code_suffixes(main, lang) - nlp 
= main.__dict__[f'spacy_nlp_{lang_spacy}'] - - with nlp.select_pipes(disable = [ - pipeline - for pipeline in ['parser', 'lemmatizer', 'senter', 'sentencizer'] - if nlp.has_pipe(pipeline) - ]): - docs = [] - - for tokens in wl_nlp_utils.split_token_list(main, inputs, pos_tagger): - # The Japanese model do not have a tagger component and Japanese POS tags are taken directly from SudachiPy - # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1910117 - if lang == 'jpn': - docs.append(''.join(tokens)) - else: - docs.append(spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [True] * len(tokens))) - - for doc in nlp.pipe(docs): - if tagset in ['default', 'raw']: - tokens_tagged.extend([(token.text, token.tag_) for token in doc]) - elif tagset == 'universal': - tokens_tagged.extend([(token.text, token.pos_) for token in doc]) - # Stanza - elif pos_tagger.startswith('stanza_'): - if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: - lang_stanza = wl_conversion.remove_lang_code_suffixes(main, lang) - else: - lang_stanza = lang + texts_tagged.append(token.text) - nlp = main.__dict__[f'stanza_nlp_{lang_stanza}'] + if tagset in ['default', 'raw']: + tags.append(token.xpos if token.xpos else token.upos) + elif tagset == 'universal': + tags.append(token.upos) + else: + for line in inputs.splitlines(): + tokens_tagged_line, tags_line = wl_pos_tag_text(main, line, lang, pos_tagger) - for doc in nlp.bulk_process([ - [tokens] - for tokens in wl_nlp_utils.split_token_list(main, inputs, pos_tagger) - ]): - for sentence in doc.sentences: - if tagset in ['default', 'raw']: - for token in sentence.words: - if token.xpos is not None: - tokens_tagged.append((token.text, token.xpos)) - else: - tokens_tagged.append((token.text, token.upos)) - elif tagset == 'universal': - tokens_tagged.extend([(token.text, token.upos) for token in sentence.words]) + texts_tagged.extend(tokens_tagged_line) + tags.extend(tags_line) else: - for tokens in wl_nlp_utils.split_token_list(main, inputs, pos_tagger): - tokens_tagged.extend(wl_pos_tag_tokens(main, tokens, lang, pos_tagger)) - - # Remove empty tokens (e.g. 
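For pre-tokenized input, spaCy's own tokenizer is bypassed by building Doc objects directly from the existing tokens, and pipes that are not needed for tagging are disabled for the duration of the call. A standalone sketch of that pattern, assuming the en_core_web_sm model is installed (the actual code looks the pipeline up from the main window and splits long token lists into chunks first):

import spacy

nlp = spacy.load('en_core_web_sm')
token_lists = [['There', 'was', 'no', 'possibility'], ['I', 'was', 'glad']]

with nlp.select_pipes(disable = [
    pipeline
    for pipeline in ['parser', 'lemmatizer', 'senter', 'sentencizer']
    if nlp.has_pipe(pipeline)
]):
    docs = [
        # Construct Doc objects from existing tokens so spaCy does not retokenize
        spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [True] * len(tokens))
        for tokens in token_lists
    ]

    for doc in nlp.pipe(docs):
        for token in doc:
            print(token.text, token.tag_, token.pos_)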
SudachiPy) and strip whitespace around tokens and tags - tokens_tagged = [ - (token_clean, tag.strip()) - for token, tag in tokens_tagged - if (token_clean := token.strip()) - ] + texts, token_properties = wl_texts.split_texts_properties(inputs) + + # spaCy + if pos_tagger.startswith('spacy_'): + lang_spacy = wl_conversion.remove_lang_code_suffixes(main, lang) + nlp = main.__dict__[f'spacy_nlp_{lang_spacy}'] + + with nlp.select_pipes(disable = [ + pipeline + for pipeline in ['parser', 'lemmatizer', 'senter', 'sentencizer'] + if nlp.has_pipe(pipeline) + ]): + docs = [] + + for tokens in wl_nlp_utils.split_token_list(main, texts, pos_tagger): + # The Japanese model do not have a tagger component and Japanese POS tags are taken directly from SudachiPy + # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1910117 + if lang == 'jpn': + docs.append(''.join(tokens)) + else: + docs.append(spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [True] * len(tokens))) + + for doc in nlp.pipe(docs): + for token in doc: + texts_tagged.append(token.text) + + if tagset in ['default', 'raw']: + tags.append(token.tag_) + elif tagset == 'universal': + tags.append(token.pos_) + # Stanza + elif pos_tagger.startswith('stanza_'): + if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: + lang_stanza = wl_conversion.remove_lang_code_suffixes(main, lang) + else: + lang_stanza = lang + + nlp = main.__dict__[f'stanza_nlp_{lang_stanza}'] + + for doc in nlp.bulk_process([ + [tokens] + for tokens in wl_nlp_utils.split_token_list(main, texts, pos_tagger) + ]): + for sentence in doc.sentences: + for token in sentence.words: + texts_tagged.append(token.text) - if not isinstance(inputs, str): - tokens_tagged_tokens = [item[0] for item in tokens_tagged] - tokens_tagged_tags = [item[1] for item in tokens_tagged] - tokens_tagged_tags = wl_nlp_utils.align_tokens(inputs, tokens_tagged_tokens, tokens_tagged_tags) + if tagset in ['default', 'raw']: + tags.append(token.xpos if token.xpos else token.upos) + elif tagset == 'universal': + tags.append(token.upos) + else: + for tokens in wl_nlp_utils.split_token_list(main, texts, pos_tagger): + results = wl_pos_tag_tokens(main, tokens, lang, pos_tagger) + + texts_tagged.extend(results[0]) + tags.extend(results[1]) + + # Remove empty tokens (e.g. 
SudachiPy) and strip whitespace around tokens and tags + tokens_tags = zip(texts_tagged.copy(), tags.copy()) + texts_tagged.clear() + tags.clear() + + for token, tag in tokens_tags: + if (token_clean := token.strip()): + texts_tagged.append(token_clean) + tags.append(tag.strip()) + + if not isinstance(inputs, str): + tags = wl_nlp_utils.align_tokens(texts, texts_tagged, tags) + + # Convert to universal POS tags + if ( + tagset == 'universal' + and ( + ( + not pos_tagger.startswith('spacy_') + and not pos_tagger.startswith('stanza_') + ) + or pos_tagger in UNIVERSAL_TAGSETS_SPACY + or pos_tagger in UNIVERSAL_TAGSETS_STANZA + ) + ): + mappings = { + tag: tag_universal + for tag, tag_universal, _, _ in main.settings_custom['pos_tagging']['tagsets']['mapping_settings'][lang][pos_tagger] + } - tokens_tagged = list(zip(inputs, tokens_tagged_tags)) + tags = [mappings.get(tag, 'X') for tag in tags] - # Insert empty tokens after alignment of input and output - for empty_offset in sorted(empty_offsets): - tokens_tagged.insert(empty_offset, ('', '')) + # Add separators between tokens and tags + tags = [f'_{tag}' for tag in tags] - # Convert to universal POS tags - if ( - tagset == 'universal' - and ( - ( - not pos_tagger.startswith('spacy_') - and not pos_tagger.startswith('stanza_') - ) - or pos_tagger in UNIVERSAL_TAGSETS_SPACY - or pos_tagger in UNIVERSAL_TAGSETS_STANZA - ) - ): - mappings = { - tag: tag_universal - for tag, tag_universal, _, _ in main.settings_custom['pos_tagging']['tagsets']['mapping_settings'][lang][pos_tagger] - } + if isinstance(inputs, str): + return wl_texts.to_tokens(texts_tagged, lang = lang, tags = tags) + else: + tokens = wl_texts.combine_texts_properties(texts, token_properties) + wl_texts.set_token_properties(tokens, 'tag', tags) - tokens_tagged = [ - (token, mappings.get(tag, 'X')) - for token, tag in list(tokens_tagged) - ] + wl_texts.update_token_properties(inputs, tokens) - return tokens_tagged + return inputs def wl_pos_tag_text(main, text, lang, pos_tagger): tokens_tagged = [] + tags = [] # English & Russian if pos_tagger.startswith('nltk_perceptron_'): + tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang) + tokens = wl_texts.to_token_texts(tokens) lang = wl_conversion.remove_lang_code_suffixes(main, lang) - tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang) - tokens_tagged = nltk.pos_tag(tokens, lang = lang) + for token, tag in nltk.pos_tag(tokens, lang = lang): + tokens_tagged.append(token) + tags.append(tag) # Japanese elif pos_tagger == 'sudachipy_jpn': - tokens_tagged = [ - (token.surface(), '-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*'])) - for token in main.sudachipy_word_tokenizer.tokenize(text) - ] + for token in main.sudachipy_word_tokenizer.tokenize(text): + tokens_tagged.append(token.surface()) + tags.append('-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*'])) # Khmer elif pos_tagger == 'khmer_nltk_khm': - tokens_tagged = khmernltk.pos_tag(text) + for token, tag in khmernltk.pos_tag(text): + tokens_tagged.append(token) + tags.append(tag) # Korean elif pos_tagger == 'python_mecab_ko_mecab': - tokens_tagged = main.python_mecab_ko_mecab.pos(text) + for token, tag in main.python_mecab_ko_mecab.pos(text): + tokens_tagged.append(token) + tags.append(tag) # Lao elif pos_tagger.startswith('laonlp_'): tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang) + tokens = wl_texts.to_token_texts(tokens) if pos_tagger == 'laonlp_seqlabeling': - tokens_tagged 
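Tagset conversion is now applied to the flat tag list: tagger-specific tags are looked up in the user-configurable mapping with 'X' as the fallback, and every tag is prefixed with '_' so it can be appended directly to the token text for display. A short sketch with an illustrative mapping:

# Illustrative mapping; the real one comes from the POS tagging settings
mappings = {'NN': 'NOUN', 'VBD': 'VERB', 'JJ': 'ADJ'}

tags = ['NN', 'VBD', 'XYZ']
tags = [mappings.get(tag, 'X') for tag in tags]  # ['NOUN', 'VERB', 'X']
tags = [f'_{tag}' for tag in tags]               # ['_NOUN', '_VERB', '_X']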
= laonlp.pos_tag(tokens, corpus = 'SeqLabeling') + results = laonlp.pos_tag(tokens, corpus = 'SeqLabeling') if pos_tagger == 'laonlp_yunshan_cup_2020': - tokens_tagged = laonlp.pos_tag(tokens, corpus = 'yunshan_cup_2020') + results = laonlp.pos_tag(tokens, corpus = 'yunshan_cup_2020') + + tokens_tagged = [token for token, _ in results] + tags = [tag for _, tag in results] # Russian & Ukrainian elif pos_tagger == 'pymorphy3_morphological_analyzer': - if lang == 'rus': - morphological_analyzer = main.pymorphy3_morphological_analyzer_rus - elif lang == 'ukr': - morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr + match lang: + case 'rus': + morphological_analyzer = main.pymorphy3_morphological_analyzer_rus + case 'ukr': + morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang) for token in tokens: - tokens_tagged.append((token, morphological_analyzer.parse(token)[0].tag._POS)) + tokens_tagged.append(token) + tags.append(morphological_analyzer.parse(token)[0].tag._POS) # Thai elif pos_tagger.startswith('pythainlp_'): tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang) - - if pos_tagger == 'pythainlp_perceptron_blackboard': - tokens_tagged = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'blackboard') - elif pos_tagger == 'pythainlp_perceptron_orchid': - tokens_tagged = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'orchid') - elif pos_tagger == 'pythainlp_perceptron_pud': - tokens_tagged = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'pud') + tokens = wl_texts.to_token_texts(tokens) + + match pos_tagger: + case 'pythainlp_perceptron_blackboard': + results = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'blackboard') + case 'pythainlp_perceptron_orchid': + results = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'orchid') + case 'pythainlp_perceptron_pud': + results = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'pud') + + tokens_tagged = [token for token, _ in results] + tags = [tag for _, tag in results] # Tibetan elif pos_tagger == 'botok_bod': tokens = main.botok_word_tokenizer.tokenize(text) for token in tokens: - if token.pos: - tokens_tagged.append((token.text, token.pos)) - else: - tokens_tagged.append((token.text, token.chunk_type)) + tokens_tagged.append(token.text) + tags.append(token.pos if token.pos else token.chunk_type) # Vietnamese elif pos_tagger == 'underthesea_vie': - tokens_tagged = underthesea.pos_tag(text) + for token, tag in underthesea.pos_tag(text): + tokens_tagged.append(token) + tags.append(tag) - return list(tokens_tagged) + return tokens_tagged, tags def wl_pos_tag_tokens(main, tokens, lang, pos_tagger): tokens_tagged = [] + tags = [] lang = wl_conversion.remove_lang_code_suffixes(main, lang) @@ -276,52 +311,63 @@ def wl_pos_tag_tokens(main, tokens, lang, pos_tagger): if pos_tagger.startswith('nltk_perceptron_'): lang = wl_conversion.remove_lang_code_suffixes(main, lang) - tokens_tagged = nltk.pos_tag(tokens, lang = lang) + for token, tag in nltk.pos_tag(tokens, lang = lang): + tokens_tagged.append(token) + tags.append(tag) # Japanese elif pos_tagger == 'sudachipy_jpn': - tokens_tagged = [ - (token.surface(), '-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*'])) - for token in main.sudachipy_word_tokenizer.tokenize(''.join(tokens)) - ] + for token in main.sudachipy_word_tokenizer.tokenize(''.join(tokens)): + 
tokens_tagged.append(token.surface()) + tags.append('-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*'])) # Khmer elif pos_tagger == 'khmer_nltk_khm': - tokens_tagged = khmernltk.pos_tag(''.join(tokens)) + for token, tag in khmernltk.pos_tag(''.join(tokens)): + tokens_tagged.append(token) + tags.append(tag) # Korean elif pos_tagger == 'python_mecab_ko_mecab': - tokens_tagged = wl_pos_tag_text(main, ' '.join(tokens), lang = 'kor', pos_tagger = 'python_mecab_ko_mecab') + tokens_tagged, tags = wl_pos_tag_text(main, ' '.join(tokens), lang = 'kor', pos_tagger = 'python_mecab_ko_mecab') # Lao elif pos_tagger.startswith('laonlp_'): if pos_tagger == 'laonlp_seqlabeling': - tokens_tagged = laonlp.pos_tag(tokens, corpus = 'SeqLabeling') + results = laonlp.pos_tag(tokens, corpus = 'SeqLabeling') if pos_tagger == 'laonlp_yunshan_cup_2020': - tokens_tagged = laonlp.pos_tag(tokens, corpus = 'yunshan_cup_2020') + results = laonlp.pos_tag(tokens, corpus = 'yunshan_cup_2020') + + tokens_tagged = [token for token, _ in results] + tags = [tag for _, tag in results] # Russian & Ukrainian elif pos_tagger == 'pymorphy3_morphological_analyzer': - if lang == 'rus': - morphological_analyzer = main.pymorphy3_morphological_analyzer_rus - elif lang == 'ukr': - morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr + match lang: + case 'rus': + morphological_analyzer = main.pymorphy3_morphological_analyzer_rus + case 'ukr': + morphological_analyzer = main.pymorphy3_morphological_analyzer_ukr for token in tokens: - tokens_tagged.append((token, morphological_analyzer.parse(token)[0].tag._POS)) + tokens_tagged.append(token) + tags.append(morphological_analyzer.parse(token)[0].tag._POS) # Thai - elif pos_tagger == 'pythainlp_perceptron_blackboard': - tokens_tagged = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'blackboard') - elif pos_tagger == 'pythainlp_perceptron_orchid': - tokens_tagged = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'orchid') - elif pos_tagger == 'pythainlp_perceptron_pud': - tokens_tagged = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'pud') + elif pos_tagger.startswith('pythainlp_'): + match pos_tagger: + case 'pythainlp_perceptron_blackboard': + results = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'blackboard') + case 'pythainlp_perceptron_orchid': + results = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'orchid') + case 'pythainlp_perceptron_pud': + results = pythainlp.tag.pos_tag(tokens, engine = 'perceptron', corpus = 'pud') + + tokens_tagged = [token for token, _ in results] + tags = [tag for _, tag in results] # Tibetan elif pos_tagger == 'botok_bod': - tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens)) - - for token in tokens_retokenized: - if token.pos: - tokens_tagged.append((token.text, token.pos)) - else: - tokens_tagged.append((token.text, token.chunk_type)) + for token in main.botok_word_tokenizer.tokenize(''.join(tokens)): + tokens_tagged.append(token.text) + tags.append(token.pos if token.pos else token.chunk_type) # Vietnamese elif pos_tagger == 'underthesea_vie': - tokens_tagged = underthesea.pos_tag(' '.join(tokens)) + for token, tag in underthesea.pos_tag(' '.join(tokens)): + tokens_tagged.append(token) + tags.append(tag) - return list(tokens_tagged) + return tokens_tagged, tags diff --git a/wordless/wl_nlp/wl_sentence_tokenization.py b/wordless/wl_nlp/wl_sentence_tokenization.py index ffb051aeb..5de285c59 100644 --- 
a/wordless/wl_nlp/wl_sentence_tokenization.py +++ b/wordless/wl_nlp/wl_sentence_tokenization.py @@ -27,7 +27,7 @@ import pythainlp import underthesea -from wordless.wl_nlp import wl_nlp_utils +from wordless.wl_nlp import wl_nlp_utils, wl_texts from wordless.wl_utils import wl_conversion, wl_misc LANG_TEXTS_NLTK = { @@ -141,19 +141,12 @@ def wl_sentence_tokenize(main, text, lang, sentence_tokenizer = 'default'): elif sentence_tokenizer == 'underthesea_vie': sentences.extend(underthesea.sent_tokenize(line)) - # Strip spaces - sentences = [ - sentence_clean - for sentence in sentences - if (sentence_clean := sentence.strip()) - ] - - return sentences + return wl_texts.clean_texts(sentences) # References: # https://stackoverflow.com/questions/9506869/are-there-character-collections-for-all-international-full-stop-punctuations/9508766#9508766 -# https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ATerminal_Punctuation%CE%B2%3DYes%3A%5D%26%5B%3ASentence_Break%CE%B2%3D%2F%5BAS%5DTerm%2F%3A%5D&g=&i= -SENTENCE_TERMINATORS = ''.join([ +# https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation=Yes:]%26[:Sentence_Break=/[AS]Term/:] +SENTENCE_TERMINATORS = ''.join(list(dict.fromkeys([ '\u0021', '\u002E', '\u003F', '\u0589', '\u061D', '\u061E', '\u061F', '\u06D4', @@ -211,7 +204,7 @@ def wl_sentence_tokenize(main, text, lang, sentence_tokenizer = 'default'): '\U00016E98', '\U0001BC9F', '\U0001DA88' -]) +]))) def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS): return [ @@ -219,8 +212,8 @@ def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS): for sentence in re.findall(fr'.+?[{terminators}][{terminators}\s]*|.+?$', text.strip()) ] -# Reference: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation%CE%B2=Yes:] -SENTENCE_SEG_TERMINATORS = ''.join([ +# Reference: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation=Yes:] +SENTENCE_SEG_TERMINATORS = ''.join(list(dict.fromkeys([ '\u0021', '\u002C', '\u002E', '\u003A', '\u003B', '\u003F', '\u037E', '\u0387', '\u0589', @@ -294,7 +287,7 @@ def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS): '\U00016E97', '\U00016E98', '\U0001BC9F', '\U0001DA87', '\U0001DA88', '\U0001DA89', '\U0001DA8A' -]) +]))) def wl_sentence_seg_tokenize(main, text, terminators = SENTENCE_SEG_TERMINATORS): return [ @@ -317,16 +310,12 @@ def wl_sentence_seg_tokenize_tokens(main, tokens, terminators = SENTENCE_SEG_TER text = REPLACEMENT_CHAR.join(tokens) for sentence_seg in re.findall(fr'.+?[{terminators}]+{REPLACEMENT_CHAR}|.+?$', text.strip()): - sentence_segs.append([ - token_clean - for token in sentence_seg.split(REPLACEMENT_CHAR) - if (token_clean := token.strip()) - ]) + sentence_segs.append(wl_texts.clean_texts(sentence_seg.split(REPLACEMENT_CHAR))) # If lengths do not match (only possible in edge cases), return all tokens as 1 sentence segment if list(wl_misc.flatten_list(sentence_segs)) != tokens: sentence_segs = [tokens] - print('Warning: lengths do not match!') + print('[Warning] Lengths do not match!') return sentence_segs diff --git a/wordless/wl_nlp/wl_sentiment_analysis.py b/wordless/wl_nlp/wl_sentiment_analysis.py index b707d24ac..8ffeb180c 100644 --- a/wordless/wl_nlp/wl_sentiment_analysis.py +++ b/wordless/wl_nlp/wl_sentiment_analysis.py @@ -24,7 +24,7 @@ import underthesea import vaderSentiment.vaderSentiment -from wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_word_tokenization +from wordless.wl_nlp import wl_nlp_utils, 
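Wrapping the terminator lists in dict.fromkeys removes duplicate code points while preserving their order, which keeps the character classes built from these strings minimal. A tiny sketch of the idiom:

# dict.fromkeys keeps the first occurrence of each character, in order
terminators = ['\u0021', '\u002E', '\u003F', '\u002E']
SENTENCE_TERMINATORS = ''.join(dict.fromkeys(terminators))  # '!.?'

# The string is later interpolated into a character class, e.g.
# fr'.+?[{SENTENCE_TERMINATORS}][{SENTENCE_TERMINATORS}\s]*|.+?$'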
wl_texts, wl_word_tokenization from wordless.wl_utils import wl_conversion, wl_paths VADER_EXCEPTIONS_ENG = [ @@ -34,7 +34,7 @@ vaderSentiment.vaderSentiment.SPECIAL_CASES ] -def wl_sentiment_analyze(main, inputs, lang, sentiment_analyzer = 'default', tagged = False): +def wl_sentiment_analyze(main, inputs, lang, sentiment_analyzer = 'default'): if sentiment_analyzer == 'default': sentiment_analyzer = main.settings_custom['sentiment_analysis']['sentiment_analyzer_settings'][lang] @@ -49,7 +49,7 @@ def wl_sentiment_analyze(main, inputs, lang, sentiment_analyzer = 'default', tag if isinstance(inputs[0], str): sentiment_scores = wl_sentiment_analyze_text(main, inputs, lang, sentiment_analyzer) else: - sentiment_scores = wl_sentiment_analyze_tokens(main, inputs, lang, sentiment_analyzer, tagged) + sentiment_scores = wl_sentiment_analyze_tokens(main, inputs, lang, sentiment_analyzer) else: sentiment_scores = [] @@ -133,12 +133,9 @@ def wl_sentiment_analyze_text(main, inputs, lang, sentiment_analyzer): return sentiment_scores -def wl_sentiment_analyze_tokens(main, inputs, lang, sentiment_analyzer, tagged): +def wl_sentiment_analyze_tokens(main, inputs, lang, sentiment_analyzer): sentiment_scores = [] - if tagged: - inputs = [wl_matching.split_tokens_tags(main, tokens)[0] for tokens in inputs] - # Stanza if sentiment_analyzer.startswith('stanza_'): if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: @@ -151,7 +148,7 @@ def wl_sentiment_analyze_tokens(main, inputs, lang, sentiment_analyzer, tagged): sentiments = [] for doc in nlp.bulk_process([ - [tokens] + [wl_texts.to_token_texts(tokens)] for tokens in wl_nlp_utils.split_token_list(main, tokens_input, sentiment_analyzer) ]): for sentence in doc.sentences: diff --git a/wordless/wl_nlp/wl_syl_tokenization.py b/wordless/wl_nlp/wl_syl_tokenization.py index 625aaa350..83b97178e 100644 --- a/wordless/wl_nlp/wl_syl_tokenization.py +++ b/wordless/wl_nlp/wl_syl_tokenization.py @@ -20,58 +20,70 @@ import pythainlp -from wordless.wl_checks import wl_checks_tokens -from wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_word_tokenization +from wordless.wl_nlp import wl_nlp_utils, wl_texts, wl_word_tokenization + +def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default', force = False): + if ( + not isinstance(inputs, str) + and inputs + and list(inputs)[0].syls is not None + and not force + ): + return inputs + else: + if inputs and lang in main.settings_global['syl_tokenizers']: + syls_tokens = [] -def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default', tagged = False): - if inputs and lang in main.settings_global['syl_tokenizers']: - syls_tokens = [] + if syl_tokenizer == 'default': + syl_tokenizer = main.settings_custom['syl_tokenization']['syl_tokenizer_settings'][lang] - if syl_tokenizer == 'default': - syl_tokenizer = main.settings_custom['syl_tokenization']['syl_tokenizer_settings'][lang] + wl_nlp_utils.init_syl_tokenizers( + main, + lang = lang, + syl_tokenizer = syl_tokenizer + ) - wl_nlp_utils.init_syl_tokenizers( - main, - lang = lang, - syl_tokenizer = syl_tokenizer - ) + if isinstance(inputs, str): + tokens = wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang) + texts = wl_texts.to_token_texts(tokens) + else: + texts, token_properties = wl_texts.split_texts_properties(inputs) - section_size = main.settings_custom['files']['misc_settings']['read_files_in_chunks'] + section_size = main.settings_custom['files']['misc_settings']['read_files_in_chunks'] + texts_sections = wl_nlp_utils.to_sections_unequal(texts, 
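Syllabification now runs over sections of the token text list so very long inputs are processed in batches sized from the read_files_in_chunks setting. A rough approximation of the chunking helper (the actual wl_nlp_utils.to_sections_unequal may distribute items differently):

def to_sections(tokens, section_size):
    # Split a list into consecutive chunks of at most section_size items
    return [
        tokens[i : i + section_size]
        for i in range(0, len(tokens), section_size)
    ]

# With read_files_in_chunks = 100, each section holds at most 100 * 50 tokens
sections = to_sections(list(range(12)), 5)  # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11]]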
section_size = section_size * 50) - if isinstance(inputs, str): - for line in inputs.splitlines(): - syls_tokens.extend(wl_syl_tokenize_text(main, line, lang, syl_tokenizer, tagged)) - else: - texts = wl_nlp_utils.to_sections_unequal(inputs, section_size = section_size * 50) + for texts_section in texts_sections: + syls_tokens.extend(wl_syl_tokenize_tokens(main, texts_section, lang, syl_tokenizer)) - for tokens in texts: - syls_tokens.extend(wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer, tagged)) - else: - if isinstance(inputs, str): - syls_tokens = [[token] for token in wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang)] - else: - syls_tokens = [[token] for token in inputs] + # Remove empty syllables and whitespace around syllables + syls_tokens = [ + tuple(wl_texts.clean_texts(syls)) + for syls in syls_tokens + if any(syls) + ] - # Remove empty syllables and whitespace around syllables - syls_tokens = [ - [syl_clean for syl in syls if (syl_clean := syl.strip())] - for syls in syls_tokens - if any(syls) - ] + if isinstance(inputs, str): + wl_texts.set_token_properties(tokens, 'syls', syls_tokens) - return syls_tokens + return tokens + else: + tokens = wl_texts.combine_texts_properties(texts, token_properties) + wl_texts.set_token_properties(tokens, 'syls', syls_tokens) -def wl_syl_tokenize_text(main, text, lang, syl_tokenizer, tagged): - tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang) + wl_texts.update_token_properties(inputs, tokens) - return wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer, tagged) + return inputs + # Do not set syllable properties if syllable tokenization is not supported + else: + if isinstance(inputs, str): + tokens = wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang) -def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer, tagged): - syls_tokens = [] + return tokens + else: + return inputs - # Separate tokens and tags - if tagged: - tokens, tags = wl_matching.split_tokens_tags(main, tokens) +def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer): + syls_tokens = [] for token in tokens: # NLTK @@ -96,23 +108,4 @@ def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer, tagged): elif syl_tokenizer == 'pythainlp_tha': syls_tokens.append(pythainlp.subword_tokenize(token, engine = 'dict')) - # Put back tokens and tags - if tagged: - for syls, tag in zip(syls_tokens, tags): - syls[-1] += tag - - return syls_tokens - -# Excluding punctuation marks -def wl_syl_tokenize_tokens_no_punc(main, tokens, lang, syl_tokenizer = 'default', tagged = False): - syls_tokens = wl_syl_tokenize(main, tokens, lang, syl_tokenizer, tagged) - - for i, syls in reversed(list(enumerate(syls_tokens))): - if len(syls) == 1: - # Separate token and tag - syl, _ = wl_matching.split_tokens_tags(main, [syls[0]]) - - if wl_checks_tokens.is_punc(syl[0]): - del syls_tokens[i] - return syls_tokens diff --git a/wordless/wl_nlp/wl_texts.py b/wordless/wl_nlp/wl_texts.py index 2c0bd653c..05dc26082 100644 --- a/wordless/wl_nlp/wl_texts.py +++ b/wordless/wl_nlp/wl_texts.py @@ -16,6 +16,7 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import copy import os import re @@ -29,6 +30,153 @@ RE_VIE_TOKENIZED = re.compile(r'(? 
0: - text.tokens_flat_punc_marks_merged[-1] = wl_word_detokenization.wl_word_detokenize( - main, - tokens = [text.tokens_flat_punc_marks_merged[-1], token], - lang = text.lang - ) - else: - text.tokens_flat_punc_marks_merged.append(token) - - # Check if the first token is a punctuation mark - if wl_checks_tokens.is_punc(text.tokens_flat_punc_marks_merged[0]): - text.tokens_multilevel[0][0][0].insert(0, '') - else: - text.tokens_flat_punc_marks_merged = tokens_flat - - # Remove empty paragraphs - if not preserve_blank_lines: - text.tokens_multilevel = [ - [ - [ - sentence_seg - for sentence_seg in sentence - if sentence_seg - ] - for sentence in para - ] - for para in text.tokens_multilevel - ] - text.tokens_multilevel = [ - [ - sentence - for sentence in para - if sentence - ] - for para in text.tokens_multilevel - ] - text.tokens_multilevel = [ - para - for para in text.tokens_multilevel - if para - ] # Assign part-of-speech tags if settings['assign_pos_tags'] and not text.tagged: - tokens_tagged = wl_pos_tagging.wl_pos_tag( + tokens = wl_pos_tagging.wl_pos_tag( main, inputs = text.get_tokens_flat(), lang = text.lang ) + text.update_token_properties(tokens) - text.tags = [[(f'_{tag}' if tag else '')] for _, tag in tokens_tagged] - # Modify text types - text.tagged = True + text_modified = copy.deepcopy(text) - # Ignore tags - if settings['ignore_tags']: - for para in text.tokens_multilevel: - for sentence in para: - for i, sentence_seg in enumerate(sentence): - sentence[i] = [(token, []) for token in sentence_seg] + # Remove tags temporarily if text is untagged and users do not choose to assign POS tags on the fly + if not settings['assign_pos_tags'] and not text.tagged: + text_modified.set_token_properties('tag', '') - text.tokens_flat_punc_marks_merged = [ - (token, []) - for token in text.tokens_flat_punc_marks_merged - ] - else: - i_tags = 0 + # Punctuation marks + if not settings['punc_marks']: + tokens_flat_punc_marks = [] + + for i, token in enumerate(text_modified.get_tokens_flat()): + if wl_checks_tokens.is_punc(token): + # Check if the first token is a punctuation mark + if i == 0: + tokens_flat_punc_marks.append(wl_texts.Wl_Token('', lang = token.lang, punc_mark = token)) + else: + token_text = wl_word_detokenization.wl_word_detokenize( + main, + tokens = [ + str(tokens_flat_punc_marks[-1]) + (tokens_flat_punc_marks[-1].punc_mark or ''), + token + ], + lang = text_modified.lang + ) + + tokens_flat_punc_marks[-1].punc_mark = token_text.replace(str(tokens_flat_punc_marks[-1]), '') + else: + tokens_flat_punc_marks.append(token) - for para in text.tokens_multilevel: + # Remove punctuation marks to match length + for para in text_modified.tokens_multilevel: for sentence in para: - for sentence_seg in sentence: - for i, token in enumerate(sentence_seg): - sentence_seg[i] = (token, text.tags[i_tags + i]) + for i, sentence_seg in enumerate(sentence): + sentence[i] = [token for token in sentence_seg if not wl_checks_tokens.is_punc(token)] - i_tags += len(sentence_seg) + text_modified.set_tokens(tokens_flat_punc_marks) - text.tokens_flat_punc_marks_merged = list(zip(text.tokens_flat_punc_marks_merged, text.tags)) + if not preserve_blank_lines: + text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel, empty_tokens = False) + + # Ignore tags + if settings['ignore_tags']: + text_modified.set_token_properties('tag', '') # Use tags only if settings['use_tags']: - for para in text.tokens_multilevel: - for sentence in para: - for i, sentence_seg in 
enumerate(sentence): - sentence[i] = [''.join(tags) for (_, tags) in sentence_seg] + text_modified.set_token_texts(text_modified.get_token_properties('tag')) + text_modified.set_token_properties('tag', '') - text.tokens_flat_punc_marks_merged = [ - ''.join(tags) - for _, tags in text.tokens_flat_punc_marks_merged - ] - else: - for para in text.tokens_multilevel: - for sentence in para: - for i, sentence_seg in enumerate(sentence): - sentence[i] = [f"{token}{''.join(tags)}" for (token, tags) in sentence_seg] + text_modified.update_num_tokens() - text.tokens_flat_punc_marks_merged = [ - f"{token}{''.join(tags)}" - for token, tags in text.tokens_flat_punc_marks_merged - ] + return text_modified - return text +def wl_process_tokens_dependency_parser(main, text, token_settings): + # Dependency parsing + tokens_modified = [] -def wl_process_tokens_colligation_extractor(main, text, token_settings): - text = wl_process_tokens(main, text, token_settings) + for para in text.tokens_multilevel: + for sentence in para: + tokens_modified.extend(wl_dependency_parsing.wl_dependency_parse( + main, + inputs = list(wl_misc.flatten_list(sentence)), + lang = text.lang, + )) - # Use tags Only - if token_settings['use_tags']: - text.tags = [ - tag - for tags in text.tags - for tag in tags - ] - else: - text.tags = [ - ''.join(tags) - for tags in text.tags - ] + text.update_token_properties(tokens_modified) + + return wl_process_tokens_concordancer(main, text, token_settings) + +def wl_process_tokens_wordlist_generator(main, text, token_settings, generation_settings): + # Syllable tokenization + if generation_settings['syllabification']: + tokens = wl_syl_tokenization.wl_syl_tokenize( + main, + inputs = text.get_tokens_flat(), + lang = text.lang, + ) + text.update_token_properties(tokens) + + text_modified = wl_process_tokens(main, text, token_settings) + text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel) + text_modified.update_num_tokens() - return text + return text_modified diff --git a/wordless/wl_nlp/wl_word_detokenization.py b/wordless/wl_nlp/wl_word_detokenization.py index 7d860e5ea..6e20e81cd 100644 --- a/wordless/wl_nlp/wl_word_detokenization.py +++ b/wordless/wl_nlp/wl_word_detokenization.py @@ -21,11 +21,12 @@ import pythainlp from wordless.wl_checks import wl_checks_tokens -from wordless.wl_nlp import wl_nlp_utils +from wordless.wl_nlp import wl_nlp_utils, wl_texts from wordless.wl_utils import wl_conversion def wl_word_detokenize(main, tokens, lang): text = '' + tokens = wl_texts.clean_texts(tokens) if lang == 'other': lang = 'eng_us' diff --git a/wordless/wl_nlp/wl_word_tokenization.py b/wordless/wl_nlp/wl_word_tokenization.py index 28a368ea7..ba7025ae5 100644 --- a/wordless/wl_nlp/wl_word_tokenization.py +++ b/wordless/wl_nlp/wl_word_tokenization.py @@ -23,7 +23,7 @@ import underthesea from wordless.wl_checks import wl_checks_tokens -from wordless.wl_nlp import wl_nlp_utils, wl_sentence_tokenization +from wordless.wl_nlp import wl_nlp_utils, wl_sentence_tokenization, wl_texts from wordless.wl_utils import wl_conversion, wl_misc def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): @@ -79,24 +79,25 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): if word_tokenizer.startswith('nltk_'): sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang) - if word_tokenizer == 'nltk_nist': - for sentence in sentences: - tokens_multilevel[-1].append(main.nltk_nist_tokenizer.international_tokenize(sentence)) - elif 
word_tokenizer == 'nltk_nltk': - for sentence in sentences: - tokens_multilevel[-1].append(main.nltk_nltk_tokenizer.tokenize(sentence)) - elif word_tokenizer == 'nltk_penn_treebank': - for sentence in sentences: - tokens_multilevel[-1].append(main.nltk_treebank_tokenizer.tokenize(sentence)) - elif word_tokenizer == 'nltk_regex': - for sentence in sentences: - tokens_multilevel[-1].append(main.nltk_regex_tokenizer.tokenize(sentence)) - elif word_tokenizer == 'nltk_tok_tok': - for sentence in sentences: - tokens_multilevel[-1].append(main.nltk_toktok_tokenizer.tokenize(sentence)) - elif word_tokenizer == 'nltk_twitter': - for sentence in sentences: - tokens_multilevel[-1].append(main.nltk_tweet_tokenizer.tokenize(sentence)) + match word_tokenizer: + case 'nltk_nist': + for sentence in sentences: + tokens_multilevel[-1].append(main.nltk_nist_tokenizer.international_tokenize(sentence)) + case 'nltk_nltk': + for sentence in sentences: + tokens_multilevel[-1].append(main.nltk_nltk_tokenizer.tokenize(sentence)) + case 'nltk_penn_treebank': + for sentence in sentences: + tokens_multilevel[-1].append(main.nltk_treebank_tokenizer.tokenize(sentence)) + case 'nltk_regex': + for sentence in sentences: + tokens_multilevel[-1].append(main.nltk_regex_tokenizer.tokenize(sentence)) + case 'nltk_tok_tok': + for sentence in sentences: + tokens_multilevel[-1].append(main.nltk_toktok_tokenizer.tokenize(sentence)) + case 'nltk_twitter': + for sentence in sentences: + tokens_multilevel[-1].append(main.nltk_tweet_tokenizer.tokenize(sentence)) # Sacremoses elif word_tokenizer == 'sacremoses_moses': lang = wl_conversion.remove_lang_code_suffixes(main, lang) @@ -140,12 +141,13 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): tokens_multilevel[-1].append(tokens) # Japanese elif word_tokenizer.startswith('sudachipy_jpn'): - if word_tokenizer == 'sudachipy_jpn_split_mode_a': - mode = sudachipy.SplitMode.A - elif word_tokenizer == 'sudachipy_jpn_split_mode_b': - mode = sudachipy.SplitMode.B - elif word_tokenizer == 'sudachipy_jpn_split_mode_c': - mode = sudachipy.SplitMode.C + match word_tokenizer: + case 'sudachipy_jpn_split_mode_a': + mode = sudachipy.SplitMode.A + case 'sudachipy_jpn_split_mode_b': + mode = sudachipy.SplitMode.B + case 'sudachipy_jpn_split_mode_c': + mode = sudachipy.SplitMode.C sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = lang) @@ -242,15 +244,16 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = 'tha') for sentence in sentences: - if word_tokenizer == 'pythainlp_longest_matching': - tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'longest')) - elif word_tokenizer == 'pythainlp_max_matching': - tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'mm')) - elif word_tokenizer == 'pythainlp_max_matching_tcc': - # Use safe mode by default - tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'newmm-safe')) - elif word_tokenizer == 'pythainlp_nercut': - tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'nercut')) + match word_tokenizer: + case 'pythainlp_longest_matching': + tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'longest')) + case 'pythainlp_max_matching': + tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'mm')) + case 'pythainlp_max_matching_tcc': + # Use safe mode by default + 
tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'newmm-safe')) + case 'pythainlp_nercut': + tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine = 'nercut')) # Tibetan elif word_tokenizer == 'botok_bod': sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = 'bod') @@ -274,9 +277,12 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): # Tokenize as sentence segments for para in tokens_multilevel: for i, sentence in enumerate(para): - tokens = [token_clean for token in sentence if (token_clean := token.strip())] + para[i] = wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, wl_texts.clean_texts(sentence)) - para[i] = wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, tokens) + for para in tokens_multilevel: + for sentence in para: + for i, sentence_seg in enumerate(sentence): + sentence[i] = wl_texts.to_tokens(sentence_seg, lang) return tokens_multilevel diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py index b26f6623f..e4b58f2f8 100644 --- a/wordless/wl_profiler.py +++ b/wordless/wl_profiler.py @@ -27,7 +27,7 @@ from PyQt5.QtCore import pyqtSignal, QCoreApplication, Qt from PyQt5.QtWidgets import QDialog, QGroupBox, QPushButton, QStackedWidget, QTabWidget -from wordless.wl_checks import wl_checks_work_area +from wordless.wl_checks import wl_checks_tokens, wl_checks_work_area from wordless.wl_dialogs import wl_dialogs_misc from wordless.wl_measures import wl_measures_lexical_diversity, wl_measures_misc, wl_measures_readability from wordless.wl_nlp import wl_texts, wl_token_processing @@ -585,10 +585,12 @@ def update_gui_table(self, err_msg, text_stats_files): class Wl_Table_Profiler_Lexical_Diversity(Wl_Table_Profiler): def __init__(self, parent): HEADERS_LEXICAL_DIVERSITY = [ + _tr('wl_profiler', "Brunét's Index"), _tr('wl_profiler', 'Corrected TTR'), _tr('wl_profiler', "Fisher's Index of Diversity"), _tr('wl_profiler', "Herdan's Vₘ"), _tr('wl_profiler', 'HD-D'), + _tr('wl_profiler', "Honoré's statistic"), _tr('wl_profiler', 'LogTTR'), _tr('wl_profiler', 'Mean Segmental TTR'), _tr('wl_profiler', 'Measure of Textual Lexical Diversity'), @@ -1162,9 +1164,8 @@ def run(self): files = list(self.main.wl_file_area.get_selected_files()) for file in files: - text = copy.deepcopy(file['text']) text = wl_token_processing.wl_process_tokens_profiler( - self.main, text, + self.main, file['text'], token_settings = settings['token_settings'] ) @@ -1172,31 +1173,7 @@ def run(self): # Total if len(files) > 1: - text_total = wl_texts.Wl_Text_Blank() - - # Set language for the combined text only if all texts are in the same language - if len({text.lang for text in texts}) == 1: - text_total.lang = texts[0].lang - else: - text_total.lang = 'other' - - text_total.tokens_multilevel = [ - copy.deepcopy(para) - for text in texts - for para in text.tokens_multilevel - ] - text_total.tokens_multilevel_with_puncs = [ - copy.deepcopy(para) - for text in texts - for para in text.tokens_multilevel_with_puncs - ] - text_total.syls_tokens = [ - syls - for text in texts - for syls in text.syls_tokens - ] - - texts.append(text_total) + texts.append(wl_texts.Wl_Text_Total(texts)) for text in texts: tokens = text.get_tokens_flat() @@ -1275,14 +1252,22 @@ def run(self): for sentence_seg in sentence ] + syls_tokens = text.get_token_properties('syls') + + # Remove punctuation marks + for i, syls in enumerate(syls_tokens): + syls_tokens[i] = tuple(syl for syl in syls if not wl_checks_tokens.is_punc(syl)) + + 
syls_tokens = [syls for syls in syls_tokens if syls] + # Token length - len_tokens_syls = [len(syls) for syls in text.syls_tokens] + len_tokens_syls = [len(syls) for syls in syls_tokens] len_tokens_chars = [len(token) for token in tokens] # Type length - len_types_syls = [len(syls) for syls in {tuple(syls) for syls in text.syls_tokens}] + len_types_syls = [len(syls) for syls in set(syls_tokens)] len_types_chars = [len(token_type) for token_type in set(tokens)] # Syllable length - len_syls = [len(syl) for syls in text.syls_tokens for syl in syls] + len_syls = [len(syl) for syls in syls_tokens for syl in syls] else: len_paras_sentences = len_paras_sentence_segs = len_paras_tokens = None len_sentences = len_sentence_segs = None diff --git a/wordless/wl_results/wl_results_filter.py b/wordless/wl_results/wl_results_filter.py index 0d118fe0e..4e593c5b1 100644 --- a/wordless/wl_results/wl_results_filter.py +++ b/wordless/wl_results/wl_results_filter.py @@ -505,9 +505,12 @@ def run(self): self.dialog.table.row_filters = [] for i in range(self.dialog.table.model().rowCount()): - filter_len_token_ngram = ( - len_token_ngram_min <= len(self.dialog.table.model().item(i, col_token_ngram).text()) <= len_token_ngram_max - ) + # Calculate length of token texts only when filtering tagged tokens and when filtering tags + len_token_ngram = sum(( + len(str(token)) + for token in self.dialog.table.model().item(i, col_token_ngram).tokens_filter + )) + filter_len_token_ngram = len_token_ngram_min <= len_token_ngram <= len_token_ngram_max if self.dialog.tab == 'wordlist_generator': filter_num_syls = False @@ -1012,9 +1015,12 @@ def run(self): self.dialog.table.row_filters = [] for i in range(self.dialog.table.model().rowCount()): - filter_len_node = ( - len_node_min <= len(self.dialog.table.model().item(i, col_node).text()) <= len_node_max - ) + # Calculate length of token texts only when filtering tagged tokens and when filtering tags + len_node = sum(( + len(str(token)) + for token in self.dialog.table.model().item(i, col_node).tokens_filter + )) + filter_len_node = len_node_min <= len_node <= len_node_max filter_freq = ( freq_min <= self.dialog.table.model().item(i, col_freq).val <= freq_max diff --git a/wordless/wl_results/wl_results_search.py b/wordless/wl_results/wl_results_search.py index 1957cdfa4..2b574d53b 100644 --- a/wordless/wl_results/wl_results_search.py +++ b/wordless/wl_results/wl_results_search.py @@ -24,7 +24,7 @@ from wordless.wl_checks import wl_checks_work_area from wordless.wl_dialogs import wl_dialogs, wl_dialogs_misc, wl_msg_boxes -from wordless.wl_nlp import wl_matching, wl_nlp_utils +from wordless.wl_nlp import wl_matching, wl_nlp_utils, wl_texts from wordless.wl_utils import wl_misc, wl_threading from wordless.wl_widgets import wl_buttons, wl_layouts, wl_widgets @@ -372,15 +372,20 @@ def run(self): ] for col in cols_to_search: + # Concordancer - Left, Node, Right & Parallel Concordancer - Parallel Unit if table.indexWidget(table.model().index(0, col)): for row in rows_to_search: - results[(row, col)] = table.indexWidget(table.model().index(row, col)).text_search + results[(row, col)] = table.indexWidget(table.model().index(row, col)).tokens_search else: for row in rows_to_search: + # Dependency Parser - Sentence / N-gram Generator - N-gram try: - results[(row, col)] = table.model().item(row, col).text_raw + results[(row, col)] = table.model().item(row, col).tokens_search except AttributeError: - results[(row, col)] = [table.model().item(row, col).text()] + results[(row, col)] = 
wl_texts.display_texts_to_tokens( + self.main, + [table.model().item(row, col).text()] + ) items = [token for text in results.values() for token in text] @@ -389,7 +394,6 @@ def run(self): search_terms_file = wl_matching.match_search_terms_ngrams( self.main, items, lang = file['lang'], - tagged = file['tagged'], token_settings = table.settings[self.dialog.tab]['token_settings'], search_settings = self.dialog.settings ) @@ -401,7 +405,7 @@ def run(self): for (row, col), text in results.items(): for ngram in wl_nlp_utils.ngrams(text, len_search_term): - if ngram == search_term: + if ngram == tuple(search_term): self.dialog.items_found.append([table, row, col]) self.dialog.items_found = sorted( diff --git a/wordless/wl_results/wl_results_sort.py b/wordless/wl_results/wl_results_sort.py index 3e8b57f2d..a3c187595 100644 --- a/wordless/wl_results/wl_results_sort.py +++ b/wordless/wl_results/wl_results_sort.py @@ -100,13 +100,13 @@ def update_gui(self, results): node_new = wl_labels.Wl_Label_Html(node_old.text(), self.table) right_new = wl_labels.Wl_Label_Html('', self.table) - left_new.text_raw = left_old.text_raw.copy() - node_new.text_raw = node_old.text_raw.copy() - right_new.text_raw = right_old.text_raw.copy() + left_new.tokens_raw = left_old.tokens_raw.copy() + node_new.tokens_raw = node_old.tokens_raw.copy() + right_new.tokens_raw = right_old.tokens_raw.copy() - left_new.text_search = left_old.text_search.copy() - node_new.text_search = node_old.text_search.copy() - right_new.text_search = right_old.text_search.copy() + left_new.tokens_search = left_old.tokens_search.copy() + node_new.tokens_search = node_old.tokens_search.copy() + right_new.tokens_search = right_old.tokens_search.copy() results[i][0] = left_new results[i][1] = node_new @@ -117,7 +117,7 @@ def update_gui(self, results): reverse = 0 if sorting_order == self.tr('Ascending') else 1 if sorting_col == self.tr('Node'): - results.sort(key = lambda item: item[1].text_raw, reverse = reverse) + results.sort(key = lambda item: item[1].tokens_raw, reverse = reverse) # Sort first by type (strings after floats), then sort numerically or alphabetically elif sorting_col == self.tr('Sentiment'): results.sort(key = lambda item: (str(type(item[3])), item[3]), reverse = reverse) @@ -129,9 +129,9 @@ def update_gui(self, results): span = int(sorting_col[1:]) if re.search(self.tr(r'^L\d+$'), sorting_col): - results.sort(key = lambda item, span = span: item[0].text_raw[-span], reverse = reverse) + results.sort(key = lambda item, span = span: item[0].tokens_raw[-span], reverse = reverse) elif re.search(self.tr(r'^R\d+$'), sorting_col): - results.sort(key = lambda item, span = span: item[2].text_raw[span - 1], reverse = reverse) + results.sort(key = lambda item, span = span: item[2].tokens_raw[span - 1], reverse = reverse) self.table.disable_updates() @@ -154,8 +154,8 @@ def update_gui(self, results): file ) in enumerate(results): # Remove empty tokens - text_left = [token for token in left.text_raw if token] - text_right = [token for token in right.text_raw if token] + text_left = [token for token in left.tokens_raw if token] + text_right = [token for token in right.tokens_raw if token] # Re-apply node color node_text = re.sub( @@ -194,13 +194,13 @@ def update_gui(self, results): self.table.indexWidget(self.table.model().index(i, 1)).setText(node_text) self.table.indexWidget(self.table.model().index(i, 2)).setText(' '.join(text_right)) - self.table.indexWidget(self.table.model().index(i, 0)).text_raw = [token for token in left.text_raw if token] 
- self.table.indexWidget(self.table.model().index(i, 1)).text_raw = node.text_raw - self.table.indexWidget(self.table.model().index(i, 2)).text_raw = [token for token in right.text_raw if token] + self.table.indexWidget(self.table.model().index(i, 0)).tokens_raw = [token for token in left.tokens_raw if token] + self.table.indexWidget(self.table.model().index(i, 1)).tokens_raw = node.tokens_raw + self.table.indexWidget(self.table.model().index(i, 2)).tokens_raw = [token for token in right.tokens_raw if token] - self.table.indexWidget(self.table.model().index(i, 0)).text_search = left.text_search - self.table.indexWidget(self.table.model().index(i, 1)).text_search = node.text_search - self.table.indexWidget(self.table.model().index(i, 2)).text_search = right.text_search + self.table.indexWidget(self.table.model().index(i, 0)).tokens_search = left.tokens_search + self.table.indexWidget(self.table.model().index(i, 1)).tokens_search = node.tokens_search + self.table.indexWidget(self.table.model().index(i, 2)).tokens_search = right.tokens_search if isinstance(sentiment, float): self.table.set_item_num(i, 3, sentiment) @@ -324,11 +324,11 @@ def table_item_changed(self): width_right = self.table.settings['concordancer']['generation_settings']['width_right_token'] else: width_left = max(( - len(self.table.indexWidget(self.table.model().index(row, 0)).text_raw) + len(self.table.indexWidget(self.table.model().index(row, 0)).tokens_raw) for row in range(self.table.model().rowCount()) )) width_right = max(( - len(self.table.indexWidget(self.table.model().index(row, 2)).text_raw) + len(self.table.indexWidget(self.table.model().index(row, 2)).tokens_raw) for row in range(self.table.model().rowCount()) )) @@ -336,11 +336,11 @@ def table_item_changed(self): self.cols_to_sort.extend([self.tr('L') + str(i + 1) for i in range(width_left)]) elif self.table.tab == 'concordancer_parallel': width_left = max(( - len(self.table.indexWidget(self.table.model().index(row, 0)).text_raw) + len(self.table.indexWidget(self.table.model().index(row, 0)).tokens_raw) for row in range(self.table.model().rowCount()) )) width_right = max(( - len(self.table.indexWidget(self.table.model().index(row, 2)).text_raw) + len(self.table.indexWidget(self.table.model().index(row, 2)).tokens_raw) for row in range(self.table.model().rowCount()) )) @@ -486,10 +486,10 @@ def run(self): node_old = self.dialog.table.indexWidget(self.dialog.table.model().index(i, 1)) right_old = self.dialog.table.indexWidget(self.dialog.table.model().index(i, 2)) - if len(left_old.text_raw) < max_left: - left_old.text_raw = [''] * (max_left - len(left_old.text_raw)) + left_old.text_raw - if len(right_old.text_raw) < max_right: - right_old.text_raw.extend([''] * (max_right - len(right_old.text_raw))) + if len(left_old.tokens_raw) < max_left: + left_old.tokens_raw = [''] * (max_left - len(left_old.tokens_raw)) + left_old.tokens_raw + if len(right_old.tokens_raw) < max_right: + right_old.tokens_raw.extend([''] * (max_right - len(right_old.tokens_raw))) sentiment = self.dialog.table.model().item(i, 3).read_data() no_token = self.dialog.table.model().item(i, 4).val diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index 4d680cefb..0ecd8a298 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -847,6 +847,7 @@ def init_settings_default(main): 'apply_lemmatization': False, 'filter_stop_words': False, + # Always assign POS tags 'assign_pos_tags': True, 
'ignore_tags': False, 'use_tags': False diff --git a/wordless/wl_settings/wl_settings_figs.py b/wordless/wl_settings/wl_settings_figs.py index 7077ffc22..ab9999932 100644 --- a/wordless/wl_settings/wl_settings_figs.py +++ b/wordless/wl_settings/wl_settings_figs.py @@ -183,7 +183,7 @@ def __init__(self, main): self.tr('Monochrome'), self.tr('Colormap') ]) - self.combo_box_font_color_colormap.addItems(matplotlib.pyplot.colormaps()) + self.combo_box_font_color_colormap.addItems(matplotlib.pyplot.colormaps()) # pylint: disable=not-callable self.stacked_widget_font_color.addWidget(self.button_font_color_monochrome) self.stacked_widget_font_color.addWidget(self.combo_box_font_color_colormap) diff --git a/wordless/wl_settings/wl_settings_lemmatization.py b/wordless/wl_settings/wl_settings_lemmatization.py index e13029abd..d53d76b3e 100644 --- a/wordless/wl_settings/wl_settings_lemmatization.py +++ b/wordless/wl_settings/wl_settings_lemmatization.py @@ -22,7 +22,7 @@ from PyQt5.QtGui import QStandardItem from PyQt5.QtWidgets import QGroupBox, QLabel, QPushButton, QTextEdit -from wordless.wl_nlp import wl_lemmatization, wl_nlp_utils, wl_word_detokenization +from wordless.wl_nlp import wl_lemmatization, wl_nlp_utils, wl_texts, wl_word_detokenization from wordless.wl_settings import wl_settings from wordless.wl_utils import wl_conversion, wl_threading from wordless.wl_widgets import wl_boxes, wl_item_delegates, wl_layouts, wl_tables @@ -220,11 +220,12 @@ def run(self): for line in preview_samples.split('\n'): if (line := line.strip()): - lemmas = wl_lemmatization.wl_lemmatize( + tokens = wl_lemmatization.wl_lemmatize( self.main, line, lang = preview_lang, lemmatizer = self.lemmatizer ) + lemmas = wl_texts.get_token_properties(tokens, 'lemma') text = wl_word_detokenization.wl_word_detokenize( self.main, lemmas, lang = preview_lang diff --git a/wordless/wl_settings/wl_settings_pos_tagging.py b/wordless/wl_settings/wl_settings_pos_tagging.py index 914362d95..17bed7e00 100644 --- a/wordless/wl_settings/wl_settings_pos_tagging.py +++ b/wordless/wl_settings/wl_settings_pos_tagging.py @@ -247,14 +247,14 @@ def run(self): for line in preview_samples.split('\n'): if (line := line.strip()): - tokens_tagged = wl_pos_tagging.wl_pos_tag( + tokens = wl_pos_tagging.wl_pos_tag( self.main, line, lang = preview_lang, pos_tagger = self.pos_tagger, tagset = self.tagset ) - preview_results.append(' '.join([f'{token}_{tag}' for token, tag in tokens_tagged])) + preview_results.append(' '.join([f'{str(token)}{token.tag}' for token in tokens])) else: preview_results.append('') diff --git a/wordless/wl_settings/wl_settings_syl_tokenization.py b/wordless/wl_settings/wl_settings_syl_tokenization.py index 48901cdfe..f1d781318 100644 --- a/wordless/wl_settings/wl_settings_syl_tokenization.py +++ b/wordless/wl_settings/wl_settings_syl_tokenization.py @@ -22,7 +22,7 @@ from PyQt5.QtGui import QStandardItem from PyQt5.QtWidgets import QGroupBox, QLabel, QPushButton, QTextEdit -from wordless.wl_nlp import wl_nlp_utils, wl_syl_tokenization, wl_word_detokenization +from wordless.wl_nlp import wl_nlp_utils, wl_syl_tokenization, wl_texts, wl_word_detokenization from wordless.wl_settings import wl_settings from wordless.wl_utils import wl_conversion, wl_threading from wordless.wl_widgets import wl_boxes, wl_item_delegates, wl_layouts, wl_tables @@ -220,17 +220,18 @@ def run(self): for line in preview_samples.split('\n'): if (line := line.strip()): - syls = wl_syl_tokenization.wl_syl_tokenize( + tokens = 
wl_syl_tokenization.wl_syl_tokenize( self.main, line, lang = preview_lang, syl_tokenizer = self.syl_tokenizer ) + syls_tokens = wl_texts.get_token_properties(tokens, 'syls') if preview_lang == 'tha': - text = ' '.join(['-'.join(syl) for syl in syls]) + text = ' '.join(['-'.join(syls) for syls in syls_tokens]) else: text = wl_word_detokenization.wl_word_detokenize( - self.main, ['-'.join(syl) for syl in syls], + self.main, ['-'.join(syls) for syls in syls_tokens], lang = preview_lang ) diff --git a/wordless/wl_wordlist_generator.py b/wordless/wl_wordlist_generator.py index b13b82827..4f53cdede 100644 --- a/wordless/wl_wordlist_generator.py +++ b/wordless/wl_wordlist_generator.py @@ -30,7 +30,7 @@ from wordless.wl_dialogs import wl_dialogs_misc from wordless.wl_figs import wl_figs, wl_figs_freqs, wl_figs_stats from wordless.wl_measures import wl_measure_utils -from wordless.wl_nlp import wl_syl_tokenization, wl_texts, wl_token_processing +from wordless.wl_nlp import wl_texts, wl_token_processing from wordless.wl_utils import wl_conversion, wl_misc, wl_sorting, wl_threading from wordless.wl_widgets import wl_layouts, wl_tables, wl_widgets @@ -364,7 +364,7 @@ def generate_table(self): wl_threading.Wl_Thread(worker_wordlist_generator_table).start_worker() - def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabification): + def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, syls_tokens): if wl_checks_work_area.check_results(self.main, err_msg, tokens_freq_files): try: self.settings = copy.deepcopy(self.main.settings_custom) @@ -453,12 +453,20 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, token self.set_item_num(i, 0, -1) # Token - self.model().setItem(i, 1, wl_tables.Wl_Table_Item(token)) + self.model().setItem(i, 1, wl_tables.Wl_Table_Item(token.display_text())) + self.model().item(i, 1).tokens_filter = [token] # Syllabification if settings['generation_settings']['syllabification']: - if len(tokens_syllabification[token]) == 1: - token_syllabified = list(tokens_syllabification[token].values())[0] + # Use tags only + if settings['token_settings']['use_tags']: + self.set_item_err( + i, 2, + _tr('wl_wordlist_generator', 'N/A'), + alignment_hor = 'left' + ) + elif len(syls_tokens[token]) == 1: + token_syllabified = list(syls_tokens[token].values())[0] if token_syllabified == _tr('wl_wordlist_generator', 'No language support'): self.set_item_err(i, 2, token_syllabified, alignment_hor = 'left') @@ -468,9 +476,9 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, token else: token_syllabified_forms = [] - for lang, syllabified_form in tokens_syllabification[token].items(): + for lang, syllabified_form in syls_tokens[token].items(): lang_text = wl_conversion.to_lang_text(self.main, lang) - token_syllabified_forms.append(f'{syllabified_form} [{lang_text}]') + token_syllabified_forms.append(f"{syllabified_form} [{lang_text}]") tokens_syllabified = ', '.join(token_syllabified_forms) @@ -526,7 +534,7 @@ def generate_fig(self): wl_threading.Wl_Thread(self.worker_wordlist_generator_fig).start_worker() - def update_gui_fig(self, err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabification): # pylint: disable=unused-argument + def update_gui_fig(self, err_msg, tokens_freq_files, tokens_stats_files, syls_tokens): # pylint: disable=unused-argument if wl_checks_work_area.check_results(self.main, err_msg, tokens_freq_files): try: settings = self.main.settings_custom['wordlist_generator'] 
@@ -576,7 +584,7 @@ def __init__(self, main, dialog_progress, update_gui): self.err_msg = '' self.tokens_freq_files = [] self.tokens_stats_files = [] - self.tokens_syllabification = {} + self.syls_tokens = {} def run(self): try: @@ -586,48 +594,36 @@ def run(self): files = list(self.main.wl_file_area.get_selected_files()) for file in files: - text = copy.deepcopy(file['text']) - text = wl_token_processing.wl_process_tokens( - self.main, text, - token_settings = settings['token_settings'] + text = wl_token_processing.wl_process_tokens_wordlist_generator( + self.main, file['text'], + token_settings = settings['token_settings'], + generation_settings = settings['generation_settings'] ) - - # Remove empty tokens - tokens_flat = text.get_tokens_flat() - tokens = [token for token in tokens_flat if token] + tokens = text.get_tokens_flat() # Frequency self.tokens_freq_files.append(collections.Counter(tokens)) # Syllabification for token in set(tokens): - if token not in self.tokens_syllabification: - self.tokens_syllabification[token] = {} + if token not in self.syls_tokens: + self.syls_tokens[token] = {} - if text.lang not in self.tokens_syllabification[token]: - if text.lang in self.main.settings_global['syl_tokenizers']: - syls_tokens = wl_syl_tokenization.wl_syl_tokenize(self.main, [token], text.lang, tagged = text.tagged) - - self.tokens_syllabification[token][text.lang] = '-'.join(syls_tokens[0]) + if text.lang not in self.syls_tokens[token]: + if token.syls: + self.syls_tokens[token][text.lang] = '-'.join(token.syls) else: - self.tokens_syllabification[token][text.lang] = _tr('wl_wordlist_generator', 'No language support') + self.syls_tokens[token][text.lang] = _tr('wl_wordlist_generator', 'No language support') texts.append(text) # Total if len(files) > 1: - text_total = wl_texts.Wl_Text_Blank() - text_total.tokens_multilevel = [ - copy.deepcopy(para) - for text in texts - for para in text.tokens_multilevel - ] + texts.append(wl_texts.Wl_Text_Total(texts)) # Frequency self.tokens_freq_files.append(sum(self.tokens_freq_files, collections.Counter())) - texts.append(text_total) - # Dispersion & Adjusted Frequency measure_dispersion = settings['generation_settings']['measure_dispersion'] measure_adjusted_freq = settings['generation_settings']['measure_adjusted_freq'] @@ -700,7 +696,7 @@ def run(self): self.err_msg, wl_misc.merge_dicts(self.tokens_freq_files), wl_misc.merge_dicts(self.tokens_stats_files), - self.tokens_syllabification + self.syls_tokens ) class Wl_Worker_Wordlist_Generator_Fig(Wl_Worker_Wordlist_Generator): @@ -712,5 +708,5 @@ def run(self): self.err_msg, wl_misc.merge_dicts(self.tokens_freq_files), wl_misc.merge_dicts(self.tokens_stats_files), - self.tokens_syllabification + self.syls_tokens )
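
Reviewer note: the Wl_Token API added to wordless/wl_nlp/wl_texts.py (most of that hunk is elided above) is what the rest of this changeset builds on via set_token_properties()/get_token_properties(). Below is a minimal, self-contained sketch of that pattern only; the class name Token, the helper signatures, and the sample values are illustrative assumptions, not the actual Wl_Token implementation. The point it shows: token annotations (tag, lemma, syls) now travel as attributes on the token objects themselves instead of being zipped with parallel tag lists or suffixed onto the token strings.

# Hypothetical, simplified illustration of the token-property pattern adopted in
# this changeset; the real implementation is wl_texts.Wl_Token and its helpers.
class Token(str):
    def __new__(cls, text, lang = 'eng_us', tag = None, lemma = None, syls = None):
        token = super().__new__(cls, text)
        token.lang = lang
        token.tag = tag
        token.lemma = lemma
        token.syls = syls

        return token

    def display_text(self):
        # Tagged display form, e.g. "possibility_NN"
        return f"{self}{self.tag or ''}"

def set_token_properties(tokens, name, vals):
    # Broadcast a single value, or assign one value per token
    if isinstance(vals, str) or vals is None:
        vals = [vals] * len(tokens)

    for token, val in zip(tokens, vals):
        setattr(token, name, val)

def get_token_properties(tokens, name):
    return [getattr(token, name) for token in tokens]

# Usage sketch: attach syllabification results to tokens, then read them back,
# mirroring how wl_syl_tokenize() and the profiler/wordlist generator consume 'syls'
tokens = [Token('possibility', tag = '_NN'), Token('walk', tag = '_NN')]
set_token_properties(tokens, 'syls', [('pos', 'si', 'bil', 'i', 'ty'), ('walk',)])

print([token.display_text() for token in tokens])  # ['possibility_NN', 'walk_NN']
print(get_token_properties(tokens, 'syls'))        # [('pos', 'si', 'bil', 'i', 'ty'), ('walk',)]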