Skip to content
This repository was archived by the owner on May 19, 2023. It is now read-only.

Commit 5ab51cc

Browse files
authored
Update Rapids datasets download URL (#527)
Update the Rapids datasets download URL to reduce latency and costs. This PR also replaces the usage of `s3fs` with `requests` to download the Rapids datasets, since we are no longer using an S3 URL.
1 parent 4628d0c commit 5ab51cc

14 files changed

+59
-57
lines changed

README.md

+5-4
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,14 @@ CLX is targeted towards cybersecurity data scientists, senior security analysts,
173173

174174
```python
175175
import cudf
176-
import s3fs
176+
import requests
177177
from os import path
178178

179179
# download data
180-
if not path.exists("./splunk_faker_raw4"):
181-
fs = s3fs.S3FileSystem(anon=True)
182-
fs.get("rapidsai-data/cyber/clx/splunk_faker_raw4", "./splunk_faker_raw4")
180+
if not path.exists('./splunk_faker_raw4'):
181+
url = 'https://data.rapids.ai/cyber/clx/splunk_faker_raw4'
182+
r = requests.get(url)
183+
open('./splunk_faker_raw4', 'wb').write(r.content)
183184

184185
# read in alert data
185186
gdf = cudf.read_csv('./splunk_faker_raw4')

examples/forest_inference/FIL_custreamz_pipeline.ipynb

+6-6
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,13 @@
2525
"metadata": {},
2626
"outputs": [],
2727
"source": [
28-
"import s3fs\n",
28+
"import requests\n",
2929
"from os import path\n",
3030
"\n",
3131
"# Download sample data and model\n",
3232
"IOT_MALWARE_JSON=\"iot_malware_1_1.json\"\n",
3333
"IOT_XGBOOST_MODEL=\"iot_xgboost_model.bst\"\n",
34-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\""
34+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\""
3535
]
3636
},
3737
{
@@ -42,8 +42,8 @@
4242
"source": [
4343
"# xgboost model\n",
4444
"if not path.exists(IOT_XGBOOST_MODEL):\n",
45-
" fs = s3fs.S3FileSystem(anon=True)\n",
46-
" fs.get(S3_BASE_PATH + \"/\" + IOT_XGBOOST_MODEL, IOT_XGBOOST_MODEL)"
45+
" r = requests.get(DATA_BASE_URL + IOT_XGBOOST_MODEL)\n",
46+
" open(IOT_XGBOOST_MODEL, 'wb').write(r.content)"
4747
]
4848
},
4949
{
@@ -54,8 +54,8 @@
5454
"source": [
5555
"# IoT data in json format\n",
5656
"if not path.exists(IOT_MALWARE_JSON):\n",
57-
" fs = s3fs.S3FileSystem(anon=True)\n",
58-
" fs.get(S3_BASE_PATH + \"/\" + IOT_MALWARE_JSON, IOT_MALWARE_JSON)"
57+
" r = requests.get(DATA_BASE_URL + IOT_MALWARE_JSON)\n",
58+
" open(IOT_MALWARE_JSON, 'wb').write(r.content)"
5959
]
6060
},
6161
{

examples/forest_inference/xgboost_streamz_pipeline.ipynb

+6-6
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@
2424
"metadata": {},
2525
"outputs": [],
2626
"source": [
27-
"import s3fs\n",
27+
"import requests\n",
2828
"from os import path\n",
2929
"\n",
3030
"# Download sample data and model\n",
3131
"IOT_MALWARE_JSON=\"iot_malware_1_1.json\"\n",
3232
"IOT_XGBOOST_MODEL=\"iot_xgboost_model.bst\"\n",
33-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\""
33+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\""
3434
]
3535
},
3636
{
@@ -41,8 +41,8 @@
4141
"source": [
4242
"# xgboost model\n",
4343
"if not path.exists(IOT_XGBOOST_MODEL):\n",
44-
" fs = s3fs.S3FileSystem(anon=True)\n",
45-
" fs.get(S3_BASE_PATH + \"/\" + IOT_XGBOOST_MODEL, IOT_XGBOOST_MODEL)"
44+
" r = requests.get(DATA_BASE_URL + IOT_XGBOOST_MODEL)\n",
45+
" open(IOT_XGBOOST_MODEL, 'wb').write(r.content)"
4646
]
4747
},
4848
{
@@ -53,8 +53,8 @@
5353
"source": [
5454
"# IoT data in json format\n",
5555
"if not path.exists(IOT_MALWARE_JSON):\n",
56-
" fs = s3fs.S3FileSystem(anon=True)\n",
57-
" fs.get(S3_BASE_PATH + \"/\" + IOT_MALWARE_JSON, IOT_MALWARE_JSON)"
56+
" r = requests.get(DATA_BASE_URL + IOT_MALWARE_JSON)\n",
57+
" open(IOT_MALWARE_JSON, 'wb').write(r.content)"
5858
]
5959
},
6060
{

examples/forest_inference/xgboost_training.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
"import cudf\n",
3333
"from cuml.preprocessing.model_selection import train_test_split\n",
3434
"\n",
35-
"import s3fs\n",
35+
"import requests\n",
3636
"from os import path"
3737
]
3838
},
@@ -51,12 +51,12 @@
5151
"source": [
5252
"# Download sample data and model\n",
5353
"IOT_MALWARE_JSON=\"iot_malware_1_1.json\"\n",
54-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\"\n",
54+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\"\n",
5555
"\n",
5656
"# IoT data in json format\n",
5757
"if not path.exists(IOT_MALWARE_JSON):\n",
58-
" fs = s3fs.S3FileSystem(anon=True)\n",
59-
" fs.get(S3_BASE_PATH + \"/\" + IOT_MALWARE_JSON, IOT_MALWARE_JSON)\n",
58+
" r = requests.get(DATA_BASE_URL + IOT_MALWARE_JSON)\n",
59+
" open(IOT_MALWARE_JSON, 'wb').write(r.content)\n",
6060
" "
6161
]
6262
},

examples/streamz/Dockerfile

+3-3
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,11 @@ RUN wget -q http://models.huggingface.co.s3.amazonaws.com/bert/raykallen/cybert_
5050
RUN wget -q http://models.huggingface.co.s3.amazonaws.com/bert/raykallen/cybert_apache_parser/pytorch_model.bin -O "$CLX_STREAMZ_HOME"/ml/models/cybert/pytorch_model.bin
5151

5252
# Download apache logs
53-
RUN wget -q https://rapidsai-data.s3.us-east-2.amazonaws.com/cyber/clx/apache_raw_sample_1k.txt -O "$CLX_STREAMZ_HOME"/data/apache_raw_sample_1k.txt
53+
RUN wget -q https://data.rapids.ai/cyber/clx/apache_raw_sample_1k.txt -O "$CLX_STREAMZ_HOME"/data/apache_raw_sample_1k.txt
5454

5555
# Download dga detection model and sample input data
56-
RUN wget -q https://rapidsai-data.s3.us-east-2.amazonaws.com/cyber/clx/dga_detection_pytorch_model.bin -O "$CLX_STREAMZ_HOME"/ml/models/dga/pytorch_model.bin
57-
RUN wget -q https://rapidsai-data.s3.us-east-2.amazonaws.com/cyber/clx/dga_detection_input.jsonlines -O "$CLX_STREAMZ_HOME"/data/dga_detection_input.jsonlines
56+
RUN wget -q https://data.rapids.ai/cyber/clx/dga_detection_pytorch_model.bin -O "$CLX_STREAMZ_HOME"/ml/models/dga/pytorch_model.bin
57+
RUN wget -q https://data.rapids.ai/cyber/clx/dga_detection_input.jsonlines -O "$CLX_STREAMZ_HOME"/data/dga_detection_input.jsonlines
5858

5959
# Zookeeper
6060
EXPOSE 2181

notebooks/alert_analysis/Alert_Analysis_with_CLX.ipynb

+3-3
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666
"from cuxfilter.charts import datashader, bokeh\n",
6767
"import panel as pn\n",
6868
"\n",
69-
"import s3fs\n",
69+
"import requests\n",
7070
"from os import path"
7171
]
7272
},
@@ -108,8 +108,8 @@
108108
"outputs": [],
109109
"source": [
110110
"if not path.exists(\"./splunk_faker_raw4\"):\n",
111-
" fs = s3fs.S3FileSystem(anon=True)\n",
112-
" fs.get(\"rapidsai-data/cyber/clx/splunk_faker_raw4\", \"./splunk_faker_raw4\")"
111+
" r = requests.get('https://data.rapids.ai/cyber/clx/splunk_faker_raw4')\n",
112+
" open('./splunk_faker_raw4', 'wb').write(r.content)"
113113
]
114114
},
115115
{

notebooks/cybert/cybert_example_training.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
"outputs": [],
2828
"source": [
2929
"from os import path\n",
30-
"import s3fs\n",
30+
"import requests\n",
3131
"import torch\n",
3232
"import torch.nn as nn\n",
3333
"import torch.nn.functional as F\n",
@@ -62,11 +62,11 @@
6262
"source": [
6363
"# download log data\n",
6464
"APACHE_SAMPLE_CSV = \"apache_sample_1k.csv\"\n",
65-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\"\n",
65+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\"\n",
6666
"\n",
6767
"if not path.exists(APACHE_SAMPLE_CSV):\n",
68-
" fs = s3fs.S3FileSystem(anon=True)\n",
69-
" fs.get(S3_BASE_PATH + \"/\" + APACHE_SAMPLE_CSV, APACHE_SAMPLE_CSV)"
68+
" r = requests.get(DATA_BASE_URL + APACHE_SAMPLE_CSV)\n",
69+
" open(APACHE_SAMPLE_CSV, 'wb').write(r.content)"
7070
]
7171
},
7272
{

notebooks/cybert/cybert_log_parsing.ipynb

+4-3
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"outputs": [],
2626
"source": [
2727
"import cudf\n",
28+
"import requests\n",
2829
"import s3fs\n",
2930
"from os import path\n",
3031
"\n",
@@ -37,7 +38,7 @@
3738
"metadata": {},
3839
"outputs": [],
3940
"source": [
40-
"CLX_S3_BASE_PATH = \"rapidsai-data/cyber/clx\"\n",
41+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\"\n",
4142
"HF_S3_BASE_PATH = \"models.huggingface.co/bert/raykallen/cybert_apache_parser\"\n",
4243
"\n",
4344
"CONFIG_FILENAME = \"config.json\"\n",
@@ -114,8 +115,8 @@
114115
"outputs": [],
115116
"source": [
116117
"if not path.exists(APACHE_SAMPLE_CSV):\n",
117-
" fs = s3fs.S3FileSystem(anon=True)\n",
118-
" fs.get(CLX_S3_BASE_PATH + \"/\" + APACHE_SAMPLE_CSV, APACHE_SAMPLE_CSV)"
118+
" r = requests.get(DATA_BASE_URL + APACHE_SAMPLE_CSV)\n",
119+
" open(APACHE_SAMPLE_CSV, 'wb').write(r.content)"
119120
]
120121
},
121122
{

notebooks/dga_detection/DGA_Detection.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
"import os\n",
3636
"import cudf\n",
3737
"import torch\n",
38-
"import s3fs\n",
38+
"import requests\n",
3939
"import logging\n",
4040
"import numpy as np\n",
4141
"from datetime import datetime\n",
@@ -83,7 +83,7 @@
8383
"source": [
8484
"INPUT_CSV = \"benign_and_dga_domains.csv\"\n",
8585
"\n",
86-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\""
86+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\""
8787
]
8888
},
8989
{
@@ -94,8 +94,8 @@
9494
"source": [
9595
"# Read Benign and DGA dataset\n",
9696
"if not os.path.exists(INPUT_CSV):\n",
97-
" fs = s3fs.S3FileSystem(anon=True)\n",
98-
" fs.get(S3_BASE_PATH + \"/\" + INPUT_CSV, INPUT_CSV)"
97+
" r = requests.get(DATA_BASE_URL + INPUT_CSV)\n",
98+
" open(INPUT_CSV, 'wb').write(r.content)"
9999
]
100100
},
101101
{

notebooks/loda_anomaly_detection/LODA_anomaly_detection.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
"import matplotlib.pylab as plt \n",
4343
"import cuml.metrics as mt\n",
4444
"import wget\n",
45-
"import s3fs;\n",
45+
"import requests;\n",
4646
"from os import path;\n",
4747
"%matplotlib inline \n",
4848
"\n",
@@ -64,11 +64,11 @@
6464
"outputs": [],
6565
"source": [
6666
"SHUTTLE_CSV = \"shuttle.csv\"\n",
67-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\"\n",
67+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\"\n",
6868
"\n",
6969
"if not path.exists(SHUTTLE_CSV):\n",
70-
" fs = s3fs.S3FileSystem(anon=True)\n",
71-
" fs.get(S3_BASE_PATH + \"/\" + SHUTTLE_CSV, SHUTTLE_CSV)\n",
70+
" r = requests.get(DATA_BASE_URL + SHUTTLE_CSV)\n",
71+
" open(SHUTTLE_CSV, 'wb').write(r.content)\n",
7272
" \n",
7373
"df = cudf.read_csv(SHUTTLE_CSV)"
7474
]

notebooks/network_mapping/CLX_Supervised_Asset_Classification.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666
"from sklearn.metrics import accuracy_score, f1_score, confusion_matrix\n",
6767
"import pandas as pd\n",
6868
"from os import path\n",
69-
"import s3fs"
69+
"import requests"
7070
]
7171
},
7272
{
@@ -277,13 +277,13 @@
277277
"metadata": {},
278278
"outputs": [],
279279
"source": [
280-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\"\n",
280+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\"\n",
281281
"WINEVT_PREPROC_CSV = \"win_events_features_preproc.csv\"\n",
282282
"\n",
283283
"# Download Zeek conn log\n",
284284
"if not path.exists(WINEVT_PREPROC_CSV):\n",
285-
" fs = s3fs.S3FileSystem(anon=True)\n",
286-
" fs.get(S3_BASE_PATH + \"/\" + WINEVT_PREPROC_CSV, WINEVT_PREPROC_CSV)\n",
285+
" r = requests.get(DATA_BASE_URL + WINEVT_PREPROC_CSV)\n",
286+
" open(WINEVT_PREPROC_CSV, 'wb').write(r.content)\n",
287287
"\n",
288288
"win_events_gdf = cudf.read_csv(\"win_events_features_preproc.csv\")"
289289
]

notebooks/network_mapping/Network_Mapping_With_RAPIDS_And_CLX.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
"\n",
5050
"import pandas as pd\n",
5151
"from os import path\n",
52-
"import s3fs"
52+
"import requests"
5353
]
5454
},
5555
{
@@ -65,13 +65,13 @@
6565
"metadata": {},
6666
"outputs": [],
6767
"source": [
68-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\"\n",
68+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\"\n",
6969
"CONN_LOG = \"conn.log\"\n",
7070
"\n",
7171
"# Download Zeek conn log\n",
7272
"if not path.exists(CONN_LOG):\n",
73-
" fs = s3fs.S3FileSystem(anon=True)\n",
74-
" fs.get(S3_BASE_PATH + \"/\" + CONN_LOG, CONN_LOG)"
73+
" r = requests.get(DATA_BASE_URL + CONN_LOG)\n",
74+
" open(CONN_LOG, 'wb').write(r.content)"
7575
]
7676
},
7777
{

notebooks/network_mapping/custream_n_graph.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
"\n",
5353
"import pandas as pd\n",
5454
"from os import path\n",
55-
"import s3fs\n",
55+
"import requests\n",
5656
"from streamz import Stream"
5757
]
5858
},
@@ -63,13 +63,13 @@
6363
"metadata": {},
6464
"outputs": [],
6565
"source": [
66-
"S3_BASE_PATH = \"rapidsai-data/cyber/clx\"\n",
66+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/clx/\"\n",
6767
"CONN_LOG = \"conn.log\"\n",
6868
"\n",
6969
"# Download Zeek conn log\n",
7070
"if not path.exists(CONN_LOG):\n",
71-
" fs = s3fs.S3FileSystem(anon=True)\n",
72-
" fs.get(S3_BASE_PATH + \"/\" + CONN_LOG, CONN_LOG)"
71+
" r = requests.get(DATA_BASE_URL + CONN_LOG)\n",
72+
" open(CONN_LOG, 'wb').write(r.content)"
7373
]
7474
},
7575
{

notebooks/pii_detection/pii_detection_training_example.ipynb

+4-4
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
"outputs": [],
3232
"source": [
3333
"from os import path\n",
34-
"import s3fs\n",
34+
"import requests\n",
3535
"import torch\n",
3636
"from torch.nn import BCEWithLogitsLoss\n",
3737
"from transformers import AutoModelForSequenceClassification, AdamW\n",
@@ -64,11 +64,11 @@
6464
"source": [
6565
"# download sample data\n",
6666
"PII_SAMPLE_CSV = \"pii_training_sample.csv\"\n",
67-
"S3_BASE_PATH = \"rapidsai-data/cyber/pii\"\n",
67+
"DATA_BASE_URL = \"https://data.rapids.ai/cyber/pii/\"\n",
6868
"\n",
6969
"if not path.exists(PII_SAMPLE_CSV):\n",
70-
" fs = s3fs.S3FileSystem(anon=True)\n",
71-
" fs.get(S3_BASE_PATH + \"/\" + PII_SAMPLE_CSV, PII_SAMPLE_CSV)"
70+
" r = requests.get(DATA_BASE_URL + PII_SAMPLE_CSV)\n",
71+
" open(PII_SAMPLE_CSV, 'wb').write(r.content)"
7272
]
7373
},
7474
{

0 commit comments

Comments
 (0)