-
Notifications
You must be signed in to change notification settings - Fork 82
Elasticsearch Adapter
This page includes instructions on how to use Elasticsearch and Cloudberry to set up a small instance of TwitterMap on a local machine.
- Linux or MacOS
cd ~
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.7.2.tar.gz
tar -xzf elasticsearch-6.7.2.tar.gz
cd elasticsearch-6.7.2/
-
./bin/elasticsearch
-
Or start it in daemon mode:
./bin/elasticsearch -d -p pid
-
To shut down Elasticsearch in daemon mode, kill the process whose ID is recorded in the pid file:
pkill -F pid
-
There is one index and one index template to create:
curl -X PUT "localhost:9200/twitter.ds_tweet" -H 'Content-Type: application/json' -d'
{
"mappings" : {
"_doc" : {
"properties" : {
"create_at" : {"type": "date", "format": "strict_date_time"},
"text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"id": {"type" : "long"},
"hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"in_reply_to_status": {"type" : "object", "enabled": false},
"in_reply_to_user": {"type" : "object", "enabled": false},
"favorite_count": {"type" : "object", "enabled": false},
"lang": {"type" : "object", "enabled": false},
"is_retweet": {"type" : "object", "enabled": false},
"coordinate": {"type" : "object", "enabled": false},
"user_mentions": {"type" : "object", "enabled": false},
"user.id": {"type" : "object", "enabled": false},
"user.name": {"type" : "object", "enabled": false},
"user.screen_name": {"type" : "object", "enabled": false},
"user.lang": {"type" : "object", "enabled": false},
"user.location": {"type" : "object", "enabled": false},
"user.profile_image_url": {"type" : "object", "enabled": false},
"user.create_at" : {"type": "date", "format": "strict_date_time"},
"user.description": {"type" : "object", "enabled": false},
"user.followers_count": {"type": "object", "enabled": false},
"user.friends_count": {"type": "object", "enabled": false},
"user.statues_count": {"type": "object", "enabled": false},
"place.country": {"type": "object", "enabled": false},
"place.country_code": {"type": "object", "enabled": false},
"place.bounding_box": {"type" : "object", "enabled": false},
"place.full_name": {"type": "object", "enabled": false},
"place.id": {"type": "object", "enabled": false},
"place.name": {"type": "object", "enabled": false},
"place.place_type": {"type": "object", "enabled": false},
"geo_tag.stateName": {"type" : "object", "enabled": false},
"geo_tag.countyName": {"type" : "object", "enabled": false},
"geo_tag.cityName": {"type" : "object", "enabled": false},
"geo_tag.stateID": {"type": "long"},
"geo_tag.countyID": {"type": "long"},
"geo_tag.cityID": {"type": "long"}
}
}
},
"settings": {
"index": {
"max_result_window": 2147483647,
"number_of_replicas": 0,
"number_of_shards": 4,
"sort.field": "create_at",
"sort.order": "desc"
}
}
}
'
curl -X PUT "localhost:9200/_template/twitter" -H 'Content-Type: application/json' -d'
{
"index_patterns": ["twitter.ds_tweet_*"],
"mappings" : {
"_doc" : {
"properties" : {
"create_at" : {"type": "date", "format": "strict_date_time"},
"text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"id": {"type" : "long"},
"hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"in_reply_to_status": {"type" : "object", "enabled": false},
"in_reply_to_user": {"type" : "object", "enabled": false},
"favorite_count": {"type" : "object", "enabled": false},
"lang": {"type" : "object", "enabled": false},
"is_retweet": {"type" : "object", "enabled": false},
"coordinate": {"type" : "object", "enabled": false},
"user_mentions": {"type" : "object", "enabled": false},
"user.id": {"type" : "object", "enabled": false},
"user.name": {"type" : "object", "enabled": false},
"user.screen_name": {"type" : "object", "enabled": false},
"user.lang": {"type" : "object", "enabled": false},
"user.location": {"type" : "object", "enabled": false},
"user.profile_image_url": {"type" : "object", "enabled": false},
"user.create_at" : {"type": "date", "format": "strict_date_time"},
"user.description": {"type" : "object", "enabled": false},
"user.followers_count": {"type": "object", "enabled": false},
"user.friends_count": {"type": "object", "enabled": false},
"user.statues_count": {"type": "object", "enabled": false},
"place.country": {"type": "object", "enabled": false},
"place.country_code": {"type": "object", "enabled": false},
"place.bounding_box": {"type" : "object", "enabled": false},
"place.full_name": {"type": "object", "enabled": false},
"place.id": {"type": "object", "enabled": false},
"place.name": {"type": "object", "enabled": false},
"place.place_type": {"type": "object", "enabled": false},
"geo_tag.stateName": {"type" : "object", "enabled": false},
"geo_tag.countyName": {"type" : "object", "enabled": false},
"geo_tag.cityName": {"type" : "object", "enabled": false},
"geo_tag.stateID": {"type": "long"},
"geo_tag.countyID": {"type": "long"},
"geo_tag.cityID": {"type": "long"}
}
}
},
"settings": {
"index": {
"max_result_window": 2147483647,
"number_of_replicas": 0,
"number_of_shards": 4,
"refresh_interval": "10s"
}
}
}
'
- Download the synthetic sample tweets (about 100K) data into
cloudberry/examples/twittermap/script/
path:
(Note: this file is sample.json.zip
, different from the sample.adm.gz
file in Quick Start tutorial)
wget http://cloudberry.ics.uci.edu/img/sample.json.zip
- Unzip
sample.json.zip
under cloudberry/examples/twittermap/script/
unzip sample.json.zip
-
Create
prepareDataForElastic.py
file -
copy the following content into it
"""Split sample.json into Elasticsearch bulk-API files named twitter-<i>.json.

Each input line is one JSON tweet. Every tweet is turned into two output
lines: a bulk "index" action carrying the tweet id, followed by the tweet
document with its timestamps rewritten. Works on both Python 2 and Python 3.
"""
from collections import OrderedDict
import io
import json

# Number of tweets per output file (each tweet produces two bulk lines).
BATCH_SIZE = 40000


def to_bulk_lines(line):
    """Return the two bulk-request lines (action, document) for one tweet.

    ``line`` is one JSON-encoded tweet. Key order of the tweet is preserved.
    The last character of ``create_at`` is dropped (presumably a trailing
    "Z" -- confirm against the data), "-0800" is appended, and the
    whitespace between date and time is replaced by "T".
    ``user.create_at`` gets "T00:00:00.000-0800" appended.
    """
    data = json.loads(line, object_pairs_hook=OrderedDict)
    action = "{\"index\":{\"_id\":\"" + str(data["id"]) + "\"}}\n"
    data["create_at"] = "T".join((data["create_at"][:-1] + "-0800").split())
    data["user"]["create_at"] = data["user"]["create_at"] + "T00:00:00.000-0800"
    return [action, json.dumps(data) + "\n"]


def write_batch(lines, path):
    """Write the accumulated bulk lines to ``path`` as UTF-8 bytes."""
    payload = "".join(lines).encode("utf-8")
    # Binary mode is required: the payload is bytes, and a text-mode file
    # rejects bytes on Python 3.
    with open(path, "wb") as out:
        out.write(payload)


def main(source="sample.json", batch_size=BATCH_SIZE, prefix="twitter-"):
    """Convert ``source`` into ``<prefix><i>.json`` files of ``batch_size`` tweets.

    Returns the number of files written.
    """
    file_index = 0
    tweets_in_batch = 0
    pending = []
    with io.open(source, "r", encoding="utf-8") as src:
        for line in src:
            if not line.strip():
                # Skip blank lines instead of crashing in json.loads.
                continue
            pending.extend(to_bulk_lines(line))
            tweets_in_batch += 1
            if tweets_in_batch == batch_size:
                write_batch(pending, prefix + str(file_index) + ".json")
                pending = []
                tweets_in_batch = 0
                file_index += 1
    if tweets_in_batch > 0:
        # Flush the final, partially filled batch.
        write_batch(pending, prefix + str(file_index) + ".json")
        file_index += 1
    return file_index


if __name__ == "__main__":
    main()
-
Save
prepareDataForElastic.py
file -
Convert the
sample.json
file into twitter-i.json
files by running prepareDataForElastic.py
python prepareDataForElastic.py
After Step 4.1
, there will be 3 files (twitter-0.json
, twitter-1.json
and twitter-2.json
) under twittermap/script
path, run the following 3 commands to ingest them into ElasticSearch.
curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter-0.json"
curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter-1.json"
curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter-2.json"
Edit file: cloudberry/cloudberry/neo/conf/application.conf
-
line 89:
asterixdb.url = "http://localhost:19002/query/service"
-
line 96:
asterixdb.lang = SQLPP
-
line 93:
#elasticsearch.url = "http://localhost:9200"
-
line 101:
#asterixdb.lang = elasticsearch
-
line 86:
berry.firstquery.gap = "60 days"
-
line 87:
berry.query.gap = "180 days"
Now you can start Cloudberry and Twittermap as normal!