-
Notifications
You must be signed in to change notification settings - Fork 0
/
greek_scansion.py
197 lines (173 loc) · 7.25 KB
/
greek_scansion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding: utf-8 -*-
"""
This program takes a line of Greek text and returns its scansion as a list of
syllable markers: LONG ('L'), SHORT ('S'), or UNKNOWN ('X'). The module is
appropriate for analyzing prose or poetry.
The current version incorporates the syllabify module from James Tauber's
greek_accentuation library, but it could easily be modified to work with any
type of syllabifier, since it analyzes consonants between vowel clusters without
reference to their assignment to the current or subsequent syllable.
Next steps for improvement are to increase the program's ability to recognize
vowels that are long or short by nature:
1. Create a tool to macronize known suffixes (i.e. syntactical endings).
2. Scrape the LSJ to build a macronized dictionary of ambiguous words.
@author: Anna Conser, Columbia University, [email protected]
@license: MIT
"""
import re
import unicodedata
from greek_accentuation.syllabify import syllabify
# CHARACTER DEFINITIONS
# Every letter table below holds bare lowercase letters (no diacritics).

# The seven Greek vowels.
VOWELS = ['α', 'ε', 'η', 'ι', 'ο', 'υ', 'ω']
vowel_re = re.compile('[' + ''.join(VOWELS) + ']')

# Vowels that are always long by nature.
LONG_VOWELS = ['η', 'ω']
long_vowel_re = re.compile('[' + ''.join(LONG_VOWELS) + ']')

# Vowels that are always short by nature.
SHORT_VOWELS = ['ε', 'ο']
short_vowel_re = re.compile('[' + ''.join(SHORT_VOWELS) + ']')

# Two-vowel sequences treated as a single long nucleus.
DIPHTHONGS = ['αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου', 'ηυ']
diphthong_re = re.compile('(' + '|'.join(DIPHTHONGS) + ')')

# Includes all three sigma forms (lunate, medial, final).
CONSONANTS = ['β', 'γ', 'δ', 'ζ', 'θ', 'κ', 'λ', 'μ', 'ν', 'ξ',
              'π', 'ρ', 'ϲ', 'σ', 'ς', 'τ', 'φ', 'χ', 'ψ'
              ]
consonant_re = re.compile('[' + ''.join(CONSONANTS) + ']')

# Single letters that count as two consonants for positional length.
DOUBLE_CONS = ['ζ', 'ξ', 'ψ']
double_con_re = re.compile('[' + ''.join(DOUBLE_CONS) + ']')

# Marks that show a vowel is long. They are matched against NFD-decomposed
# text, where a Greek circumflex appears as U+0342 (perispomeni), not as the
# Latin combining circumflex U+0302 -- so both must be listed.
# NOTE: the caret must never be first in this list, or the character class
# built below would become a negated class.
LONG_MARKS = [u'\u0302',  # combining circumflex
              u'\u0342',  # combining Greek perispomeni (Greek circumflex)
              u'\u0304',  # macron
              u'\u02C6',  # another circumflex
              u'\u005E',  # caret
              u'\u0345',  # iota subscript
              u'\u037A'   # another iota subscript
              ]
long_mark_re = re.compile('[' + ''.join(LONG_MARKS) + ']')

# Combining breve: marks a vowel as short.
SHORT_MARK = u"\u0306"

# Clusters that do not necessarily make the preceding syllable long.
MUTE_LIQUID = ['θλ', 'θρ', 'θμ', 'θν',  # voiceless stop + liquid or nasal
               'κλ', 'κρ', 'κμ', 'κν',
               'πλ', 'πρ', 'πν', 'πμ',
               'τλ', 'τρ', 'τν', 'τμ',
               'φλ', 'φρ', 'φν', 'φμ',
               'χλ', 'χρ', 'χν', 'χμ',
               'βρ', 'γρ', 'δρ'          # voiced stop + rho
               ]
# NOTE: the following stop+liquid combinations have been excluded as they almost
# universally make a syllable long by position:
# long_stop_liquids = ['γμ', 'γν', 'δμ', 'δν', 'βλ', 'γλ']

# Modifier-letter apostrophe, used to mark an elided vowel.
APOSTROPHE = u'\u02BC'
PUNCTUATION = ".,;·:'<>[]{}()=+"

# Syllable length markers returned by the scanning functions.
LONG = 'L'
SHORT = 'S'
UNKNOWN = 'X'
#INTERNAL FUNCTIONS
def strip_str(string):
    """Removes diacritical markings from a string and makes lowercase.

    The whole string is decomposed (NFD) and every combining character is
    dropped, so precomposed text ('ά') and already-decomposed text
    ('α' + combining acute) produce the same bare result.

    :param str string: a string of text
    :return: a lowercase string of text without diacritical markings
    :rtype: str
    """
    decomposed = unicodedata.normalize('NFD', string)
    # combining() is non-zero exactly for combining marks (accents,
    # breathings, iota subscript, macron, breve, ...).
    bare_str = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    return bare_str.lower()
def alnum_syl(string):
    """Reduces a syllable to its lowercase word characters.

    Every run of non-word characters is deleted, then the apostrophe that
    marks apocope (APOSTROPHE) is deleted as well, and the remainder is
    lowercased.

    :param str string: a syllable, possibly with surrounding punctuation
    :return: the cleaned, lowercase syllable
    :rtype: str
    """
    word_chars = re.sub(r'\W+', '', string)
    # APOSTROPHE is a single literal character, so a plain replace removes
    # exactly what a regex substitution on it would.
    without_apostrophe = word_chars.replace(APOSTROPHE, '')
    return without_apostrophe.lower()
def natural_length(syl):
    """Classifies a syllable by the natural length of its vowel.

    Checks are made in this order, and the first match wins:
    -- LONG: eta or omega, a diphthong, or a long diacritic (circumflex,
       macron, iota subscript).
    -- SHORT: epsilon or omicron, or a short mark.
    -- UNKNOWN: anything else, i.e. α, ι, or υ with no diacritic that
       indicates length.

    :param str syl: a single syllable
    :return: LONG, SHORT, or UNKNOWN
    :rtype: str
    """
    plain = strip_str(syl)                            # letters only
    decomposed = unicodedata.normalize('NFD', syl)    # letters + marks

    is_long = (
        long_vowel_re.search(plain)
        or diphthong_re.search(plain)
        or long_mark_re.search(decomposed)
    )
    if is_long:
        return LONG

    if short_vowel_re.search(plain) or SHORT_MARK in decomposed:
        return SHORT

    return UNKNOWN
def positional_length(syl, next_syl):
    """Returns the positional length of a syllable, given the one after it.

    Only the consonants standing between the two syllables' vowel clusters
    are examined, so the syllables can come from any tool that separates
    vowel clusters. Stop + liquid rules follow the principles that apply in
    tragedy.

    :param str syl: the syllable being measured
    :param str next_syl: the following syllable, or the sentinel 'END' when
        ``syl`` is the last syllable of the line
    :return: LONG, SHORT, or UNKNOWN
    :rtype: str
    """
    # Detect the sentinel BEFORE cleaning: alnum_syl() lowercases its input,
    # so a comparison with 'END' made afterwards can never succeed.
    at_line_end = next_syl == 'END'

    # A word break is whitespace at the seam between the raw syllables.
    # Slices rather than indexes, so an empty syllable cannot raise.
    wordbreak = bool(re.search(r'\s', syl[-1:] + next_syl[:1]))

    # Clean extraneous markings, then strip diacritics so that accented
    # vowels and rho with a breathing compare equal to the bare letters
    # listed in VOWELS and CONSONANTS.
    syl = strip_str(alnum_syl(syl))
    next_syl = '' if at_line_end else strip_str(alnum_syl(next_syl))

    # Consonants after the vowel of the first syllable...
    trailing = ''
    for ch in reversed(syl):
        if ch not in CONSONANTS:
            break
        trailing = ch + trailing
    # ...and consonants before the vowel of the following syllable.
    leading = ''
    for ch in next_syl:
        if ch not in CONSONANTS:
            break
        leading += ch
    consonant_cluster = trailing + leading

    if len(consonant_cluster) <= 1:
        # Zero or one consonant leaves the syllable short, unless that one
        # letter is a double consonant.
        if double_con_re.search(consonant_cluster):
            status = LONG
        else:
            status = SHORT
    elif consonant_cluster in MUTE_LIQUID:
        # Stop + liquid: short when the whole cluster begins the next word
        # (word break, and this syllable ends in a vowel); within a word it
        # can lengthen or not as the context requires.
        if wordbreak and syl[-1:] in VOWELS:
            status = SHORT
        else:
            status = UNKNOWN
    else:
        # Any other cluster of two or more consonants lengthens.
        status = LONG

    # A short syllable can function as long at line end (final anceps).
    if at_line_end and status == SHORT:
        status = UNKNOWN
    return status
#PRIMARY FUNCTION
def scan_line(line):
    """Scans a line of Greek text, as best as possible without knowing
    whether ambiguous vowels are long by nature.

    Each syllable is first classified by natural length. If that is not
    LONG, its positional length is consulted:
    -- a positionally LONG syllable is LONG;
    -- otherwise a naturally SHORT syllable takes the positional result
       (SHORT or UNKNOWN);
    -- otherwise the syllable stays UNKNOWN.

    :param str line: a line of Greek text
    :return: one marker per syllable: LONG ('L'), SHORT ('S') or
        UNKNOWN ('X')
    :rtype: list
    """
    syl_list = syllabify(line.strip())
    last_index = len(syl_list) - 1
    meter = []
    for i, syl in enumerate(syl_list):
        current_length = natural_length(syl)
        if current_length != LONG:
            # 'END' is the sentinel positional_length() expects for the
            # last syllable of the line.
            next_syl = 'END' if i == last_index else syl_list[i + 1]
            pos_len = positional_length(syl, next_syl)
            if pos_len == LONG:
                current_length = LONG
            elif current_length == SHORT:
                current_length = pos_len
        meter.append(current_length)
    return meter