-
Notifications
You must be signed in to change notification settings - Fork 0
/
dump_articles.py
89 lines (79 loc) · 2.66 KB
/
dump_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import subprocess
import json
import pandas as pd
import numpy as np
import pymongo
import argparse
def parse_args(argv=None):
    """Parse the MongoDB connection options from the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``mongo_host`` (str) and
        ``mongo_port`` (int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mongo_host", default="localhost", dest="mongo_host")
    # type=int: without it a port supplied on the command line stays a string,
    # which pymongo.MongoClient rejects (the port must be an int).
    parser.add_argument("--mongo_port", default=27017, type=int)
    return parser.parse_args(argv)


def load_terms(path="terms"):
    """Read the search terms, one per line, from *path*.

    Surrounding whitespace is stripped and blank lines are skipped so that an
    empty string never ends up as a search term.

    Args:
        path: File to read; defaults to ``terms`` in the working directory.

    Returns:
        List of non-empty, stripped terms in file order.
    """
    with open(path, encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [term for term in stripped if term]


def density_threshold(density):
    """Return ``exp(mean(log d) - std(log d))`` over the non-zero densities.

    Zero densities are dropped before taking the logarithm. The standard
    deviation is the population one (``ddof=0``), which is what ``np.std``
    computes.

    Args:
        density: pandas Series of geospan densities.

    Returns:
        The threshold as a plain Python ``float``.

    Raises:
        ValueError: If no non-zero density is available.
    """
    nonzero = density[density != 0]
    if nonzero.empty:
        raise ValueError("no non-zero geospan densities to derive a threshold from")
    log_density = np.log(nonzero)
    return float(np.exp(log_density.mean() - log_density.std(ddof=0)))


def build_final_query(terms, quantiles, threshold_density):
    """Build the MongoDB filter selecting the articles to dump.

    Args:
        terms: Search terms matched against ``text_matches``.
        quantiles: DataFrame with columns ``n_nonparen_geospans`` and
            ``length`` whose rows are, in order, the 0.01, 0.25, 0.95 and
            0.99 quantiles.
        threshold_density: Exclusive lower bound on
            ``article_meta.nonparen_geospan_density``.

    Returns:
        A dict usable both as a pymongo filter and, once serialised with
        ``json.dumps``, as the ``-q`` argument of ``mongodump``.
    """
    geospans = quantiles["n_nonparen_geospans"]
    lengths = quantiles["length"]
    # Cast numpy scalars to plain floats so the JSON and BSON encoders only
    # ever see built-in types.
    return {
        "text_matches": {"$in": terms},
        "article_meta.article_type": "research-article",
        "article_meta.has_body": True,
        "article_meta.n_nonparen_geospans": {
            "$gt": float(geospans.iloc[0]),  # 0.01 quantile
            "$lte": float(geospans.iloc[3]),  # 0.99 quantile
        },
        "article_meta.nonparen_geospan_density": {"$gt": float(threshold_density)},
        "$and": [
            {
                "$expr": {
                    "$gt": [
                        {"$strLenCP": "$extracted_text"},
                        float(lengths.iloc[0]),  # 0.01 quantile
                    ]
                }
            },
            {
                "$expr": {
                    "$lte": [
                        {"$strLenCP": "$extracted_text"},
                        float(lengths.iloc[2]),  # 0.95 quantile
                    ]
                }
            },
        ],
    }


def build_mongodump_command(host, port, query):
    """Return the ``mongodump`` argument list for dumping ``pmc.articles``.

    The host and port are passed explicitly so the dump is taken from the
    same server the query thresholds were computed against.

    Args:
        host: MongoDB host name.
        port: MongoDB port.
        query: Filter dict, serialised to JSON for ``-q``.

    Returns:
        List of strings suitable for ``subprocess.run``.
    """
    return [
        "mongodump",
        "--host",
        host,
        "--port",
        str(port),
        "--gzip",
        "--archive=ai4e_articles.gzip",
        "-d",
        "pmc",
        "-c",
        "articles",
        "-q",
        json.dumps(query),
    ]


def main(argv=None):
    """Compute selection thresholds from ``pmc.articles`` and dump the matches.

    Steps: run a preliminary aggregation over research articles that match
    the terms, derive quantile bounds and a density threshold from it, print
    the resulting query with its document count, then run ``mongodump``.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: If the preliminary query matches no articles.
        ValueError: If every matching article has zero geospan density.
        subprocess.CalledProcessError: If ``mongodump`` exits non-zero.
    """
    args = parse_args(argv)
    articles = pymongo.MongoClient(args.mongo_host, args.mongo_port).pmc.articles
    terms = load_terms()

    simple_query = {
        "text_matches": {"$in": terms},
        "article_meta.article_type": "research-article",
        "article_meta.has_body": True,
        "article_meta.n_nonparen_geospans": {"$exists": True},
        "article_meta.nonparen_geospan_density": {"$exists": True},
    }
    cursor = articles.aggregate(
        [
            {"$match": simple_query},
            {
                "$project": {
                    # Drop _id: it is an object column that makes
                    # DataFrame.quantile fail on pandas >= 2.0, where
                    # numeric_only defaults to False.
                    "_id": 0,
                    "n_nonparen_geospans": "$article_meta.n_nonparen_geospans",
                    "nonparen_geospan_density": "$article_meta.nonparen_geospan_density",
                    "length": {"$strLenCP": "$extracted_text"},
                }
            },
        ]
    )
    subset_df = pd.DataFrame(cursor)
    if subset_df.empty:
        raise SystemExit("No articles match the preliminary query; nothing to dump.")

    quantiles = subset_df.quantile(q=[0.01, 0.25, 0.95, 0.99])
    threshold_density = density_threshold(subset_df["nonparen_geospan_density"])
    final_query = build_final_query(terms, quantiles, threshold_density)

    print(
        "Dumping {} articles matching this query:\n\n{}".format(
            articles.count_documents(final_query), json.dumps(final_query, indent=4)
        )
    )
    # check=True: a failed dump must not look like a success.
    subprocess.run(
        build_mongodump_command(args.mongo_host, args.mongo_port, final_query),
        check=True,
    )


if __name__ == "__main__":
    main()