-
Notifications
You must be signed in to change notification settings - Fork 82
Elasticsearch Adapter
This page includes instructions on how to use Elasticsearch and Cloudberry to set up a small instance of TwitterMap on a local machine.
- Linux or MacOS
cd ~
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.7.2.tar.gz
tar -xzf elasticsearch-6.7.2.tar.gz
cd elasticsearch-6.7.2/
-
./bin/elasticsearch
-
Or start it in daemon mode:
./bin/elasticsearch -d -p pid
-
To shut down Elasticsearch when it is running in daemon mode, kill the process whose ID is stored in the pid file:
pkill -F pid
-
There is one index and one index template to create:
curl -X PUT "localhost:9200/twitter.ds_tweet" -H 'Content-Type: application/json' -d'
{
"mappings" : {
"_doc" : {
"properties" : {
"create_at" : {"type": "date", "format": "strict_date_time"},
"text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"id": {"type" : "long"},
"hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"in_reply_to_status": {"type" : "object", "enabled": false},
"in_reply_to_user": {"type" : "object", "enabled": false},
"favorite_count": {"type" : "object", "enabled": false},
"lang": {"type" : "object", "enabled": false},
"is_retweet": {"type" : "object", "enabled": false},
"coordinate": {"type" : "object", "enabled": false},
"user_mentions": {"type" : "object", "enabled": false},
"user.id": {"type" : "object", "enabled": false},
"user.name": {"type" : "object", "enabled": false},
"user.screen_name": {"type" : "object", "enabled": false},
"user.lang": {"type" : "object", "enabled": false},
"user.location": {"type" : "object", "enabled": false},
"user.profile_image_url": {"type" : "object", "enabled": false},
"user.create_at" : {"type": "date", "format": "strict_date_time"},
"user.description": {"type" : "object", "enabled": false},
"user.followers_count": {"type": "object", "enabled": false},
"user.friends_count": {"type": "object", "enabled": false},
"user.statues_count": {"type": "object", "enabled": false},
"place.country": {"type": "object", "enabled": false},
"place.country_code": {"type": "object", "enabled": false},
"place.bounding_box": {"type" : "object", "enabled": false},
"place.full_name": {"type": "object", "enabled": false},
"place.id": {"type": "object", "enabled": false},
"place.name": {"type": "object", "enabled": false},
"place.place_type": {"type": "object", "enabled": false},
"geo_tag.stateName": {"type" : "object", "enabled": false},
"geo_tag.countyName": {"type" : "object", "enabled": false},
"geo_tag.cityName": {"type" : "object", "enabled": false},
"geo_tag.stateID": {"type": "long"},
"geo_tag.countyID": {"type": "long"},
"geo_tag.cityID": {"type": "long"}
}
}
},
"settings": {
"index": {
"max_result_window": 2147483647,
"number_of_replicas": 0,
"number_of_shards": 4,
"sort.field": "create_at",
"sort.order": "desc"
}
}
}
'
curl -X PUT "localhost:9200/_template/twitter" -H 'Content-Type: application/json' -d'
{
"index_patterns": ["twitter.ds_tweet_*"],
"mappings" : {
"_doc" : {
"properties" : {
"create_at" : {"type": "date", "format": "strict_date_time"},
"text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"id": {"type" : "long"},
"hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"in_reply_to_status": {"type" : "object", "enabled": false},
"in_reply_to_user": {"type" : "object", "enabled": false},
"favorite_count": {"type" : "object", "enabled": false},
"lang": {"type" : "object", "enabled": false},
"is_retweet": {"type" : "object", "enabled": false},
"coordinate": {"type" : "object", "enabled": false},
"user_mentions": {"type" : "object", "enabled": false},
"user.id": {"type" : "object", "enabled": false},
"user.name": {"type" : "object", "enabled": false},
"user.screen_name": {"type" : "object", "enabled": false},
"user.lang": {"type" : "object", "enabled": false},
"user.location": {"type" : "object", "enabled": false},
"user.profile_image_url": {"type" : "object", "enabled": false},
"user.create_at" : {"type": "date", "format": "strict_date_time"},
"user.description": {"type" : "object", "enabled": false},
"user.followers_count": {"type": "object", "enabled": false},
"user.friends_count": {"type": "object", "enabled": false},
"user.statues_count": {"type": "object", "enabled": false},
"place.country": {"type": "object", "enabled": false},
"place.country_code": {"type": "object", "enabled": false},
"place.bounding_box": {"type" : "object", "enabled": false},
"place.full_name": {"type": "object", "enabled": false},
"place.id": {"type": "object", "enabled": false},
"place.name": {"type": "object", "enabled": false},
"place.place_type": {"type": "object", "enabled": false},
"geo_tag.stateName": {"type" : "object", "enabled": false},
"geo_tag.countyName": {"type" : "object", "enabled": false},
"geo_tag.cityName": {"type" : "object", "enabled": false},
"geo_tag.stateID": {"type": "long"},
"geo_tag.countyID": {"type": "long"},
"geo_tag.cityID": {"type": "long"}
}
}
},
"settings": {
"index": {
"max_result_window": 2147483647,
"number_of_replicas": 0,
"number_of_shards": 4,
"refresh_interval": "10s"
}
}
}
'
- Gunzip
sample.json.gz
under cloudberry/examples/twittermap/script/
gunzip sample.json.gz
- Create
prepareDataForElastic.py
file, copy the following content into it and save it
from collections import OrderedDict
import json


def to_bulk_entry(line):
    """Convert one line of sample.json into an Elasticsearch bulk entry.

    The returned string holds two newline-terminated lines: the bulk
    "index" action line carrying the tweet id as the document ``_id``,
    followed by the tweet document itself.
    """
    # OrderedDict keeps the fields in the same order as the input record.
    data = json.loads(line, object_pairs_hook=OrderedDict)
    action = "{\"index\":{\"_id\":\"" + str(data["id"]) + "\"}}\n"
    # Append a fixed -0800 offset and replace the whitespace between the
    # date and time parts with "T", e.g.
    # "2016-01-01 12:00:00.000" -> "2016-01-01T12:00:00.000-0800".
    data["create_at"] = "T".join((data["create_at"] + "-0800").split())
    return action + json.dumps(data) + "\n"


def main():
    """Read sample.json and write the bulk-formatted twitter.json."""
    with open("sample.json", "r") as f, open("twitter.json", "w") as g:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if line.strip():
                g.write(to_bulk_entry(line))


if __name__ == "__main__":
    main()
- Convert the
sample.json
file into twitter.json
file by running prepareDataForElastic.py
python prepareDataForElastic.py
- Move
twitter.json
to the elasticsearch directory: ~/elasticsearch-6.7.2/
curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter.json"
Edit file: cloudberry/cloudberry/neo/conf/application.conf
-
line 89:
asterixdb.url = "http://localhost:19002/query/service"
-
line 96:
asterixdb.lang = SQLPP
-
line 93:
#elasticsearch.url = "http://localhost:9200"
-
line 101:
#asterixdb.lang = elasticsearch
-
line 86:
berry.firstquery.gap = "60 days"
-
line 87:
berry.query.gap = "180 days"
Now you can start Cloudberry and Twittermap as normal!