Skip to content

Commit b571366

Browse files
authored
Merge pull request #2 from Edinburgh-Genome-Foundry/dev
Biopython fix + protein sequence support
2 parents 0940ba7 + b5f4f05 commit b571366

File tree

12 files changed

+308
-164
lines changed

12 files changed

+308
-164
lines changed

LICENCE.txt

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
The MIT License (MIT)
2-
[OSI Approved License]
3-
4-
The MIT License (MIT)
1+
MIT License
52

63
Copyright (c) 2018 Edinburgh Genome Foundry
74

@@ -12,13 +9,13 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
129
copies of the Software, and to permit persons to whom the Software is
1310
furnished to do so, subject to the following conditions:
1411

15-
The above copyright notice and this permission notice shall be included in
16-
all copies or substantial portions of the Software.
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
1714

1815
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1916
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2017
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
2118
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2219
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24-
THE SOFTWARE.
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.rst

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,27 +71,40 @@ To write the sequences down as Genbank records, with annotations:
7171
from crazydoc import records_to_genbank
7272
records_to_genbank(biopython_records)
7373
74+
Note that ``records_to_genbank()`` will truncate the record name to 20 characters,
75+
to fit in the GenBank format. Additionally, slashes (``/``) will be replaced with
76+
hyphens (``-``) in the filenames. To read protein sequences, pass ``is_protein=True``:
77+
78+
.. code:: python
79+
80+
biopython_records = parse_doc_file(protein_path, is_protein=True)
81+
82+
This will return *protein* records, which will be saved with a GenPept extension
83+
(.gp) by ``records_to_genbank(biopython_records, is_protein=True)``,
84+
unless a different extension is specified with ``extension=``.
85+
86+
7487
Installation
75-
-------------
88+
------------
7689

77-
(soon) You can install crazydoc through PIP
90+
You can install crazydoc with pip:
7891

7992
.. code::
8093
8194
sudo pip install crazydoc
8295
83-
Alternatively, you can unzip the sources in a folder and type
96+
Alternatively, you can unzip the sources in a folder and type:
8497

8598
.. code::
8699
87100
sudo python setup.py install
88101
89102
License = MIT
90-
--------------
103+
-------------
91104

92-
Crazydoc is an open-source software originally written at the `Edinburgh Genome Foundry <http://genomefoundry.org>`_ by `Zulko <https://github.com/Zulko>`_ and `released on Github <https://github.com/Edinburgh-Genome-Foundry/crazydoc>`_ under the MIT licence (copyright Edinburg Genome Foundry).
105+
Crazydoc is open-source software originally written at the `Edinburgh Genome Foundry <http://genomefoundry.org>`_ by `Zulko <https://github.com/Zulko>`_ and `released on GitHub <https://github.com/Edinburgh-Genome-Foundry/crazydoc>`_ under the MIT licence (Copyright 2018 Edinburgh Genome Foundry).
93106

94-
Everyone is welcome to contribute !
107+
Everyone is welcome to contribute!
95108

96109
More biology software
97110
---------------------

crazydoc/CrazydocParser.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
from docx import Document
22

3-
from .Observers import (HighlightColor, FontColor, Bold, Italic, UpperCase,
4-
LowerCase, Underline)
3+
from .Observers import (
4+
HighlightColor,
5+
FontColor,
6+
Bold,
7+
Italic,
8+
UpperCase,
9+
LowerCase,
10+
Underline,
11+
)
512
from .biotools import string_is_sequence
613

714

@@ -22,27 +29,33 @@ class CrazydocParser:
2229
``highlight_color``, ``font_color``, ``bold``, ``italic``,
2330
``upper_case``, ``lower_case``, ``underline``.
2431
"""
32+
2533
observers_dict = {
2634
_class.name: _class()
27-
for _class in (HighlightColor, FontColor, Bold, Italic, UpperCase,
28-
LowerCase, Underline)
35+
for _class in (
36+
HighlightColor,
37+
FontColor,
38+
Bold,
39+
Italic,
40+
UpperCase,
41+
LowerCase,
42+
Underline,
43+
)
2944
}
3045

3146
def __init__(self, observers):
3247
self.observers = [
33-
self.observers_dict[o] if isinstance(o, str) else o
34-
for o in observers
48+
self.observers_dict[o] if isinstance(o, str) else o for o in observers
3549
]
3650

37-
38-
def _extract_sequence_names_and_runs(self, doc):
51+
def _extract_sequence_names_and_runs(self, doc, is_protein=False):
3952
"""Parse the doc, return a list [(sequence_name, sequenceruns), ...]"""
4053
sequence_name = None
4154
sequence_paragraphs = []
4255
reading_sequence = False
4356
for paragraph in doc.paragraphs:
4457
stripped = paragraph.text.replace(" ", "")
45-
if string_is_sequence(stripped):
58+
if string_is_sequence(stripped, is_protein=is_protein):
4659
if reading_sequence:
4760
sequence_paragraphs[-1][1].append(paragraph)
4861
else:
@@ -52,18 +65,17 @@ def _extract_sequence_names_and_runs(self, doc):
5265
if reading_sequence:
5366
sequence_name = None
5467
reading_sequence = False
55-
if paragraph.text.startswith('>'):
68+
if paragraph.text.startswith(">"):
5669
sequence_name = paragraph.text[1:].strip()
57-
sequence_paragraphs
5870
return [
5971
(name, [run for par in paragraphs for run in par.runs])
6072
for name, paragraphs in sequence_paragraphs
6173
]
6274

63-
def _msword_runs_to_record(self, runs):
75+
def _msword_runs_to_record(self, runs, is_protein=False):
6476
"""Transform a MS Word runs list to a biopython record."""
6577
records = [
66-
observer.msword_runs_to_record(runs)
78+
observer.msword_runs_to_record(runs, is_protein=is_protein)
6779
for observer in self.observers
6880
]
6981
final_record = records[0]
@@ -82,29 +94,32 @@ def _msword_runs_to_record(self, runs):
8294
record_features[location] = feature
8395
return final_record
8496

85-
86-
87-
def parse_doc_file(self, filepath=None, doc=None):
97+
def parse_doc_file(self, filepath=None, doc=None, is_protein=False):
8898
"""Return a list of records, 1 for each sequence contained in the docx.
8999
90100
Parameters
91101
----------
92102
93103
filepath
94-
A path to a docx file
104+
A path to a docx file.
95105
96106
doc
97107
A python-docx Document object, which can be provided instead of the
98108
file path.
109+
110+
is_protein
111+
True if the sequences are protein sequences (default: False).
99112
"""
100113
if doc is None:
101114
doc = Document(filepath)
102115
records = []
103-
for name, runs in self._extract_sequence_names_and_runs(doc):
104-
record = self._msword_runs_to_record(runs)
116+
for name, runs in self._extract_sequence_names_and_runs(
117+
doc, is_protein=is_protein
118+
):
119+
record = self._msword_runs_to_record(runs, is_protein=is_protein)
105120
if name is not None:
106121
record.id = name
107-
record.name = name.replace(' ', '_')
122+
record.name = name.replace(" ", "_")
108123
for observer in self.observers:
109124
observer.process_record_features(record)
110125
records.append(record)

crazydoc/Observers.py

Lines changed: 39 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from .conf import conf
22
from .biotools import sequence_to_record, sequence_to_annotated_record
33

4+
45
class StyleObserver:
56
"""Generic class for observing style-based annotations in sequences.
67
7-
The subclasses observe each one particular type of DNA sequence annotation
8-
such as the highlight color, bold text, underlines, etc.
9-
8+
The provided subclasses each observe one particular type of DNA sequence
9+
annotation, such as the highlight color, bold text, underlines, etc.
1010
"""
11+
1112
def __init__(self):
1213
pass
1314

@@ -19,31 +20,33 @@ def process_feature(self, feature):
1920
if self.name not in feature.qualifiers:
2021
return
2122
value = feature.qualifiers[self.name]
22-
label = ''
23-
if 'label' in feature.qualifiers:
24-
label = feature.qualifiers['label'] + '; '
23+
label = ""
24+
if "label" in feature.qualifiers:
25+
label = feature.qualifiers["label"] + "; "
2526
label += self.name
2627
if not isinstance(value, bool):
2728
label += ": " + str(value)
28-
feature.qualifiers['label'] = label
29+
feature.qualifiers["label"] = label
2930

3031
def aggregate_features_from_runs(self, runs):
31-
features = [[None, '']]
32+
features = [[None, ""]]
3233
for run in runs:
3334
value = self.evaluate(run)
34-
text = run.text.replace(' ', '')
35+
text = run.text.replace(" ", "")
3536
if value == features[-1][0]:
3637
features[-1][1] += text
3738
else:
3839
features.append([value, text])
3940
return features
4041

41-
def msword_runs_to_record(self, runs):
42+
def msword_runs_to_record(self, runs, is_protein=False):
4243
feature_records = [
4344
(
44-
sequence_to_annotated_record(text, **{self.name: val})
45+
sequence_to_annotated_record(
46+
text, is_protein=is_protein, **{self.name: val}
47+
)
4548
if val
46-
else sequence_to_record(text)
49+
else sequence_to_record(text, is_protein=is_protein)
4750
)
4851
for (val, text) in self.aggregate_features_from_runs(runs)
4952
]
@@ -60,16 +63,16 @@ def process_feature(self, feature):
6063
if self.name not in feature.qualifiers:
6164
return
6265
color = feature.qualifiers[self.name]
63-
for field in ['color', 'ApEinfo_revcolor', 'ApEinfo_fwdcolor']:
66+
for field in ["color", "ApEinfo_revcolor", "ApEinfo_fwdcolor"]:
6467
feature.qualifiers[field] = color
6568

6669

6770
class CharactersObserver(StyleObserver):
6871
"""Subclass for character-by-character observers."""
6972

7073
def aggregate_features_from_runs(self, runs):
71-
features = [[None, '']]
72-
text = ''.join([r.text for r in runs])
74+
features = [[None, ""]]
75+
text = "".join([r.text for r in runs])
7376
for character in text:
7477
value = self.evaluate(character)
7578
if value == features[-1][0]:
@@ -81,7 +84,8 @@ def aggregate_features_from_runs(self, runs):
8184

8285
class Italic(StyleObserver):
8386
"""Captures italic text."""
84-
name = 'italic'
87+
88+
name = "italic"
8589

8690
def evaluate(self, run):
8791
"""Return whether the run has italic style"""
@@ -90,15 +94,18 @@ def evaluate(self, run):
9094

9195
class Bold(StyleObserver):
9296
"""Captures bold text."""
93-
name = 'bold'
97+
98+
name = "bold"
9499

95100
def evaluate(self, run):
96101
"""Return whether the run has bold style"""
97102
return run.bold
98103

104+
99105
class Underline(StyleObserver):
100106
"""Captures underlined text."""
101-
name = 'underline'
107+
108+
name = "underline"
102109

103110
def evaluate(self, run):
104111
"""Return whether the run has underline style"""
@@ -107,42 +114,47 @@ def evaluate(self, run):
107114

108115
class FontColor(ColorObserver):
109116
"""Captures text with non-black font color."""
110-
name = 'font_color'
117+
118+
name = "font_color"
111119

112120
def evaluate(self, run):
113121
"""Return False if no color, else the #ae60bf color."""
114122
color = str(run.font.color.rgb)
115-
if color in ['None', '000000']:
123+
if color in ["None", "000000"]:
116124
return False
117125
else:
118126
return "#" + color
119127

120128

121129
class HighlightColor(ColorObserver):
122130
"""Captures text with a background-highlighting color."""
123-
name = 'highlight_color'
131+
132+
name = "highlight_color"
124133

125134
def evaluate(self, run):
126135
"""Return False if no background color, else the #ae60bf color."""
127136
color = run.font.highlight_color
128137
if color is None:
129138
return False
130139
else:
131-
return conf['color_theme'][color._member_name]
140+
return conf["color_theme"][color._member_name]
141+
132142

133143
class UpperCase(CharactersObserver):
134144
"""Captures upper-case text."""
135-
name = 'upper_case'
145+
146+
name = "upper_case"
136147

137148
def evaluate(self, character):
138149
"""Return whether the character is upper"""
139-
return (character == character.upper())
150+
return character == character.upper()
140151

141152

142153
class LowerCase(CharactersObserver):
143154
"""Captures lower-case text."""
144-
name = 'lower_case'
145155

146-
def evaluate(self, character):#
156+
name = "lower_case"
157+
158+
def evaluate(self, character): #
147159
"""Return whether the character is lower"""
148-
return (character == character.lower())
160+
return character == character.lower()

0 commit comments

Comments
 (0)