-
Notifications
You must be signed in to change notification settings - Fork 0
/
taxon_parse_utils.py
67 lines (54 loc) · 2.51 KB
/
taxon_parse_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Docstring: This is a utility file, outlining various useful functions to be used
for parsing taxonomic nomenclature
"""
import pandas as pd
import re
# Module-level side effect: importing this file sets pandas' 'expand_frame_repr'
# display option to False for the whole process (affects how wide DataFrames print).
pd.set_option('expand_frame_repr', False)
# Taxon parsing tools: helpers that modify or parse taxon columns and related info.
def separate_qualifiers(tax_frame: pd.DataFrame, tax_col: str) -> pd.DataFrame:
    """separate_qualifiers: splits qualifiers (cf., aff., vel aff., sec.) out of a taxon column.

    The qualifier found in each row is stored in a new 'qualifier' column and the
    qualifier text is removed from the taxon column itself.

    args:
        tax_frame: dataframe containing the taxon string column to parse.
                   It is modified in place (and also returned).
        tax_col: the name of the taxon column which we want to parse.
    returns:
        tax_frame: the same dataframe, with a 'qualifier' column holding the matched
                   qualifier (pd.NA where none was found) and with qualifiers removed
                   from tax_col. Non-string entries of tax_col are left untouched.
    """
    tax_frame['qualifier'] = pd.NA

    # Order matters: later entries overwrite earlier ones for the same row,
    # so 'vel aff.' takes precedence over the 'aff.' it contains.
    qualifiers = ['cf.', 'aff.', 'vel aff.', 'sec.']
    for qual in qualifiers:
        # regex=False: the '.' must be a literal dot, otherwise 'aff.' would
        # also match e.g. 'affinis'. na=False: missing names never match, which
        # keeps the mask strictly boolean for .loc.
        qual_mask = tax_frame[tax_col].str.contains(qual, regex=False, na=False)
        if qual_mask.any():
            tax_frame.loc[qual_mask, 'qualifier'] = qual

    # Only strings can carry a qualifier; pass missing/non-string values through
    # unchanged instead of letting str.replace fail on them.
    tax_frame[tax_col] = tax_frame[tax_col].apply(
        lambda name: remove_qualifiers(name) if isinstance(name, str) else name
    )
    return tax_frame
def remove_qualifiers(tax_string: str) -> str:
    """remove_qualifiers: removes qualifiers such as cf. or aff. from any taxon string.

    args:
        tax_string: taxon name from which qualifiers should be removed. Values that
                    are not strings (e.g. None or NaN) are returned unchanged.
    returns:
        tax_string: the name without qualifier substrings. If anything was removed,
                    leading/trailing whitespace left behind is stripped as well;
                    strings without a qualifier are returned exactly as given.
    """
    if not isinstance(tax_string, str):
        return tax_string

    # Order matters: the space-prefixed form goes first so the separating space is
    # consumed together with the qualifier, and 'vel aff.' goes before 'aff.' so the
    # longer qualifier is not left half-removed.
    qual_list = (" cf.", "cf.", " vel aff.", "vel aff.", " aff.", "aff.", " sec.", "sec.")
    cleaned = tax_string
    for qual_str in qual_list:
        cleaned = cleaned.replace(qual_str, "")

    # A qualifier at the very start ("cf. Quercus alba") leaves a stray space behind.
    if cleaned != tax_string:
        cleaned = cleaned.strip()
    return cleaned
def extract_after_subtax(text):
    """extract_after_subtax: returns the substring that follows a subtaxon/infrataxon
    rank marker (subsp., var., subvar., f., subform.). Useful for parsing taxon names.

    Rank markers are tried in the order listed above, and the first marker that
    occurs anywhere in the text decides the result.

    args:
        text: the verbatim taxon name that will be parsed.
    returns:
        extracted_text: the whitespace-stripped substring after the rank marker,
                        or None if no rank marker is present or text is not a string.
    """
    if not isinstance(text, str):
        return None

    patterns = (
        r"subsp\.",
        r"var\.",
        r"subvar\.",
        # The lookbehinds stop the forma marker 'f.' from matching the tail of
        # the qualifiers 'cf.' and 'aff.', which are not ranks.
        r"(?<!\bc)(?<!\baf)f\.",
        r"subform\.",
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return text[match.end():].strip()
    return None