-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcopyout.py
228 lines (199 loc) · 9.59 KB
/
copyout.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
from db_connection import DBConnection
import os
from shutil import copyfile
from utils_mixin import Utils
from doi_database import DoiDatabase
import logging
class CopyOut(Utils):
    """Copy matched papers for one publication year and write TSV reports.

    A "match" is a row of the ``matches`` table joined to ``dois`` (see
    :meth:`get_matches`). Match rows are consumed positionally as
    ``(doi, collection, full_path, published_date, journal_title,
    date_added, notes, digital_only)``.
    """

    # Header line shared by every TSV report written by this class.
    # NOTE(review): rows may carry one extra trailing column (the
    # comma-joined identifiers written by write_match) that has no header.
    _TSV_HEADER = (
        "doi\tcollection\tjournal_title\ttitle\tpublished_date"
        "\tdate_added\tnotes\tdigital_only\n"
    )

    def __init__(self, year, config):
        """Remember the year to export and the configuration object.

        :param year: publication year used to restrict queries and to name
            output directories/files.
        :param config: configuration object; this class calls its
            ``get_string`` and ``get_boolean`` methods.
        """
        self.year = year
        self.config = config

    def get_matches(self):
        """Query the ``matches`` and ``dois`` tables for this year's matches.

        A row is kept when the DOI is present in both tables,
        ``matches.skip`` is not 1, and the year restriction produced by
        ``self.sql_year_restriction`` holds. Rows are ordered by collection,
        then published date.

        :return: The query results containing matched rows.
        :rtype: list
        """
        # The year restriction is an SQL fragment built by a helper that is
        # not defined in this class (presumably inherited from Utils), so it
        # is interpolated rather than bound as a parameter.
        sql = f"""select matches.doi,
                         matches.collection,
                         dois.full_path,
                         dois.published_date,
                         dois.journal_title,
                         matches.date_added,
                         matches.notes,
                         matches.digital_only
                  from matches, dois where matches.doi = dois.doi and
                  matches.skip != 1 and
                  dois.{self.sql_year_restriction(self.year, self.year)} order by collection,dois.published_date"""
        return DBConnection.execute_query(sql)

    def get_textfile_path(self, doi):
        """Look up the stored text-file path for a DOI in the ``scans`` table.

        :param doi: The DOI to look up.
        :type doi: str
        :return: ``textfile_path`` of the first matching row, or None when
            there is no row or the column is NULL.
        """
        sql = "SELECT textfile_path FROM collections_papers.scans WHERE doi = %s"
        result = DBConnection.execute_query(sql, (doi,))
        return result[0][0] if result and result[0][0] is not None else None

    def make_target_dir(self, dest_dir, collection):
        """Ensure ``dest_dir/collection/year`` exists and return it.

        :param dest_dir: root output directory.
        :type dest_dir: str
        :param collection: collection directory name (e.g. botany) created
            inside ``dest_dir``.
        :type collection: str
        :return: year directory within the collection directory
        :rtype: str
        """
        year_dir = f"{dest_dir}/{collection}/{self.year}"
        # makedirs creates every missing level; exist_ok avoids the
        # check-then-create race of testing os.path.exists first.
        os.makedirs(year_dir, exist_ok=True)
        return year_dir

    def _copy_out_file(self, origin_path, collection, dest_dir):
        """Copy one file into ``dest_dir/collection/year`` unless already there.

        A progress message is printed for ``.pdf`` and ``.txt`` targets;
        files with any other extension are copied silently.

        :param origin_path: The path of the origin file to be copied.
        :type origin_path: str
        :param collection: The name of the collection directory to copy the file into.
        :type collection: str
        :param dest_dir: The destination directory where the file will be copied.
        :type dest_dir: str
        """
        target_dir = self.make_target_dir(dest_dir, collection)
        target = f"{target_dir}/{os.path.basename(origin_path)}"

        if target.endswith('.pdf'):
            label = "Paper"
        elif target.endswith('.txt'):
            label = "Text"
        else:
            label = None

        already_copied = os.path.exists(target)
        if label is not None:
            if already_copied:
                print(f"{label} Already Copied from {origin_path}")
            else:
                print(f"New {label} from {origin_path}")

        # An existing target is never overwritten.
        if not already_copied:
            copyfile(origin_path, target)

    def generate_file_path(self, doi, file_type='txt'):
        """Build the expected on-disk path of a paper's PDF or text file.

        The path is ``<base>/<issn>/<year>/<doi with '/' -> '_'>.<file_type>``
        where ``<base>`` comes from the configuration.

        :param doi: The DOI of the paper for which to generate the file path.
        :type doi: str
        :param file_type: The type of file to generate path for ('txt' or 'pdf').
        :type file_type: str
        :return: The generated full file path, or None when the DOI has no
            row in the ``dois`` table.
        :rtype: str
        """
        # Bind the DOI as a parameter (same style as get_textfile_path)
        # instead of interpolating it, so quotes in a DOI cannot break or
        # alter the statement.
        sql = "SELECT issn, YEAR(published_date) FROM collections_papers.dois WHERE doi = %s"
        results = DBConnection.execute_query(sql, (doi,))
        if not results:
            print(f"No results for doi {doi}!!! : {sql}")
            return None

        issn, year = results[0]
        if file_type == 'pdf':
            base_path = self.config.get_string("downloaders", "pdf_directory")
        else:
            # Anything other than 'pdf' uses the text-file directory.
            base_path = self.config.get_string("scan", "scan_text_directory")

        # '/' cannot appear in a file name, so it is replaced in the DOI.
        normalized_doi = doi.replace('/', '_')
        return os.path.join(base_path, issn, str(year), f"{normalized_doi}.{file_type}")

    def copy_out_files(self, dest_dir="./"):
        """Copy the PDF (and optionally the text file) of every match.

        Text files are copied only when the ``copyout_txt`` option of the
        ``copyout`` config section is true.

        NOTE: the ``digital_only`` flag of a match (column 7) is not
        consulted here; an earlier skip on that flag was commented out in
        the original code.

        :param dest_dir: The destination directory where the files will be copied to, defaults to "./"
        :type dest_dir: str, optional
        """
        for cur_match in self.get_matches():
            doi = cur_match[0]
            collection = cur_match[1]

            origin_path = self.generate_file_path(doi, "pdf")
            if origin_path is None:
                # generate_file_path has already reported the missing DOI;
                # passing None on would raise TypeError in os.path.basename.
                continue
            self._copy_out_file(origin_path, collection, dest_dir)

            if self.config.get_boolean("copyout", "copyout_txt"):
                text_path = self.generate_file_path(doi, "txt")
                if text_path is not None and os.path.exists(text_path):
                    self._copy_out_file(text_path, collection, dest_dir)
                else:
                    print(f" Missing text file: {text_path}")

    def write_match(self, cur_match, filehandle, db):
        """Write one matched record as a TSV line.

        The line holds the eight report columns and, when the
        ``matched_collection_ids`` table has rows for the DOI, one extra
        column with their identifiers joined by commas
        (example identifier: cas 229582).

        :param cur_match: A tuple containing matched data fields, including DOI, collection, origin_path, published_date, journal_title, date_added, notes, and digital_only.
        :type cur_match: tuple
        :param filehandle: The file handle where the matched data will be written.
        :type filehandle: file
        :param db: An instance of the DoiDatabase class used for retrieving DOI record information.
        :type db: DoiDatabase
        """
        doi = cur_match[0]
        doi_record = db.get_doi(doi)
        title = self.clean_string(doi_record.get_title())

        # Column order must match the TSV header.
        fields = (
            doi,
            cur_match[1],  # collection
            cur_match[4],  # journal_title
            title,
            cur_match[3],  # published_date
            cur_match[5],  # date_added
            cur_match[6],  # notes
            cur_match[7],  # digital_only
        )
        # Only fields that are actually None become '-'. A blanket
        # str.replace('None', '-') on the whole line would also corrupt
        # text that merely contains "None" (e.g. "Nonetheless").
        filehandle.write("\t".join("-" if field is None else str(field) for field in fields))

        sql = "select identifier from matched_collection_ids where matched_collection_ids.doi = %s"
        results = DBConnection.execute_query(sql, (doi,))
        if results:
            filehandle.write("\t" + ",".join(str(row[0]) for row in results))
        filehandle.write("\n")

    def dump_file_tsv(self, path="./"):
        """Dump the matched data to ``<path>/matched_<year>.tsv``.

        :param path: The directory path where the TSV file will be saved. Defaults to "./".
        :type path: str, optional
        """
        os.makedirs(path, exist_ok=True)
        db = DoiDatabase(self.config)
        filename = f"{path}/matched_{self.year}.tsv"
        # The context manager guarantees the report is flushed and closed,
        # even if a row fails to write.
        with open(filename, "w") as fh:
            fh.write(self._TSV_HEADER)
            for cur_match in self.get_matches():
                self.write_match(cur_match, fh, db)

    def dump_custom(self, special_note_string, path):
        """Dump and copy out the matches whose scan lines mention a note.

        A match is selected when any of its ``found_scan_lines.line`` values
        contains ``special_note_string`` (compared case-insensitively).
        Selected matches are written to
        ``<path>/matched_<year>_<special_note_string>.tsv`` and their file is
        copied under ``<path>/<special_note_string>/<year>``.

        :param special_note_string: A string representing the special note to filter matched data. These include: "antcat", "antweb", "inaturalist", "catalogue of fishes"
        :type special_note_string: str
        :param path: The directory path where the TSV file will be saved.
        :type path: str
        """
        os.makedirs(path, exist_ok=True)
        db = DoiDatabase(self.config)
        filename = f"{path}/matched_{self.year}_{special_note_string}.tsv"
        # Lines are lowercased before comparison, so lowercase the needle too;
        # otherwise a mixed-case note could never match.
        needle = special_note_string.lower()
        sql = "select line from found_scan_lines where doi = %s"

        with open(filename, "w") as fh:
            fh.write(self._TSV_HEADER)
            for cur_match in self.get_matches():
                scan_db_results = DBConnection.execute_query(sql, (cur_match[0],))
                # `or ""` tolerates a NULL line instead of raising on .lower().
                found_special_note = any(
                    needle in (line_array[0] or "").lower()
                    for line_array in scan_db_results
                )
                if found_special_note:
                    self.write_match(cur_match, fh, db)
                    # NOTE(review): this copies from dois.full_path, whereas
                    # copy_out_files derives the path via generate_file_path.
                    # Confirm both point at the same file.
                    origin_path = cur_match[2]
                    self._copy_out_file(origin_path, special_note_string, path)