forked from ericwhyne/darpa_open_catalog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MEMEX-data.json
executable file
·182 lines (182 loc) · 7.3 KB
/
MEMEX-data.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
[
{
"DARPA Program": "MEMEX",
"Data Set Name": "Ebola TREC DD Raw PDF",
"Description": "The PDFs contained within the directory were crawled from the following sites using Apache Nutchi 1.10-SNAPSHOT. http://www.who.int/en, http://fts.unocha.org, http://www.worldbank.org/en/topic/ebola, http://apps.who.int/ebola/en/current-situation/ebola-situation-report The document corpus represents a dump of ONLY PDFs from the above sites with the aim of capturing a corpus representing sentiment focusing on the Ebola Crisis domain.",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"Ebola"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Ebola TREC DD PDF Solr Index",
"Description": "Provides a Solr index representation of the data described as 'The PDFs contained within the directory were crawled from the following sites using Apache Nutchi 1.10-SNAPSHOT. http://www.who.int/en, http://fts.unocha.org, http://www.worldbank.org/en/topic/ebola, http://apps.who.int/ebola/en/current-situation/ebola-situation-report The document corpus represents a dump of ONLY PDFs from the above sites with the aim of capturing a corpus representing sentiment focusing on the Ebola Crisis domain.'",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"Ebola"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Florida District Court of Appeals",
"Description": "4130 PDF documents (1st - 5th Districts dating back to 2007) - https://crawl-jpl.s3.amazonaws.com/florida_state_appeals_court.zip",
"Number of Rows": "4130",
"Number of Columns": "",
"Industry": [
"Public Criminal Law"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "District of Columbia Court of Appeals",
"Description": "375 PDF documents (compounded documents) - https://crawl-jpl.s3.amazonaws.com/district_of_colombia.zip",
"Number of Rows": "375",
"Number of Columns": "",
"Industry": [
"Public Criminal Law"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "New York State 1st District Court of Appeals",
"Description": "1798 PDF documents - https://crawl-jpl.s3.amazonaws.com/new_york_state_appellate_1st_div.zip",
"Number of Rows": "1798",
"Number of Columns": "",
"Industry": [
"Public Criminal Law"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "New York State 2nd District Court of Appeals",
"Description": "8745 PDF documents - https://crawl-jpl.s3.amazonaws.com/new_york_state_appellate_2nd_div.zip",
"Number of Rows": "8745",
"Number of Columns": "",
"Industry": [
"Public Criminal Law"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "New York State 4th District Court of Appeals",
"Description": "85 (compounded documents) - https://crawl-jpl.s3.amazonaws.com/new_york_state_appellate_4th_div.zip",
"Number of Rows": "85",
"Number of Columns": "",
"Industry": [
"Public Criminal Law"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Escort Sites",
"Description": "Multiple crawls of various escort sites as defined by this seed list: https://memexproxy.com/wiki/display/MPM/Web+Sites+For+Scraping. Data Set is available at: http://memex.dyndns.org:8983/solr/",
"Number of Rows": "1019923",
"Number of Columns": "",
"Industry": [
"Human Trafficking"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Image Catalog",
"Description": "Annotated version of the Roxyimages 32M+ image dataset containing all extracted EXIF metadata (camera serial number, etc.) by Tika, as well as OCR image to text extraction from Tika/Tesseract.",
"Number of Rows": "31508592",
"Number of Columns": "60+",
"Industry": [
"Human Trafficking"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Ebola news articles in html. (ebola_01_12_2015.tar.gz)",
"Description": "This dataset contains 328k articles about Ebola in html.",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"News Media"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Paper abstracts from Journal Cambridge in html. (abstracts.tar.gz)",
"Description": "This dataset contains abstracts and metadata (title, authors, affiliations) of 69k papers (1980-2014) crawled from journals.cambridge.org.",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"Material Sciences Literature"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Papers (fulltext) from Journal Cambridge in PDF",
"Description": "This dataset contains 67k full-text papers (1980-2014) in PDF format, crawled from journals.cambridge.org.",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"Material Sciences Literature"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Papers (fulltext) from Journal Cambridge in PDF",
"Description": "This dataset contains 67k full-text papers extracted from PDF version.",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"Material Sciences Literature"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Papers (fulltext) from Nature Materials Journal in html",
"Description": "This dataset contains 2771 full-text papers (2002 to 2015) in html format crawled from http://www.nature.com/nmat/archive/index.html.",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"Material Sciences Literature"
],
"New Date": "20150410",
"Update Date": ""
},
{
"DARPA Program": "MEMEX",
"Data Set Name": "Papers (fulltext) from Nature Materials Journal in PDF",
"Description": "This dataset contains 2771 full-text papers (2002 to 2015) in PDF format crawled from http://www.nature.com/nmat/archive/index.html.",
"Number of Rows": "",
"Number of Columns": "",
"Industry": [
"Material Sciences Literature"
],
"New Date": "20150410",
"Update Date": ""
}
]