-
Notifications
You must be signed in to change notification settings - Fork 82
Elasticsearch Adapter
This page explains how to use Elasticsearch and Cloudberry to set up a small instance of TwitterMap on a local machine.
- Linux or MacOS
cd ~
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.7.2.tar.gz
tar -xzf elasticsearch-6.7.2.tar.gz
cd elasticsearch-6.7.2/
-
./bin/elasticsearch
-
Or start it in daemon mode:
./bin/elasticsearch -d -p pid
-
To shut down Elasticsearch when it is running in daemon mode, kill the process whose ID is stored in the pid file:
pkill -F pid
-
Create one index and one index template:
curl -X PUT "localhost:9200/twitter.ds_tweet" -H 'Content-Type: application/json' -d'
{
"mappings" : {
"_doc" : {
"properties" : {
"create_at" : {"type": "date", "format": "strict_date_time"},
"text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"id": {"type" : "long"},
"hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"in_reply_to_status": {"type" : "object", "enabled": false},
"in_reply_to_user": {"type" : "object", "enabled": false},
"favorite_count": {"type" : "object", "enabled": false},
"lang": {"type" : "object", "enabled": false},
"is_retweet": {"type" : "object", "enabled": false},
"coordinate": {"type" : "object", "enabled": false},
"user_mentions": {"type" : "object", "enabled": false},
"user.id": {"type" : "object", "enabled": false},
"user.name": {"type" : "object", "enabled": false},
"user.screen_name": {"type" : "object", "enabled": false},
"user.lang": {"type" : "object", "enabled": false},
"user.location": {"type" : "object", "enabled": false},
"user.profile_image_url": {"type" : "object", "enabled": false},
"user.create_at" : {"type": "date", "format": "strict_date_time"},
"user.description": {"type" : "object", "enabled": false},
"user.followers_count": {"type": "object", "enabled": false},
"user.friends_count": {"type": "object", "enabled": false},
"user.statues_count": {"type": "object", "enabled": false},
"place.country": {"type": "object", "enabled": false},
"place.country_code": {"type": "object", "enabled": false},
"place.bounding_box": {"type" : "object", "enabled": false},
"place.full_name": {"type": "object", "enabled": false},
"place.id": {"type": "object", "enabled": false},
"place.name": {"type": "object", "enabled": false},
"place.place_type": {"type": "object", "enabled": false},
"geo_tag.stateName": {"type" : "object", "enabled": false},
"geo_tag.countyName": {"type" : "object", "enabled": false},
"geo_tag.cityName": {"type" : "object", "enabled": false},
"geo_tag.stateID": {"type": "long"},
"geo_tag.countyID": {"type": "long"},
"geo_tag.cityID": {"type": "long"}
}
}
},
"settings": {
"index": {
"max_result_window": 2147483647,
"number_of_replicas": 0,
"number_of_shards": 4,
"sort.field": "create_at",
"sort.order": "desc"
}
}
}
'
curl -X PUT "localhost:9200/_template/twitter" -H 'Content-Type: application/json' -d'
{
"index_patterns": ["twitter.ds_tweet_*"],
"mappings" : {
"_doc" : {
"properties" : {
"create_at" : {"type": "date", "format": "strict_date_time"},
"text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"id": {"type" : "long"},
"hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
"in_reply_to_status": {"type" : "object", "enabled": false},
"in_reply_to_user": {"type" : "object", "enabled": false},
"favorite_count": {"type" : "object", "enabled": false},
"lang": {"type" : "object", "enabled": false},
"is_retweet": {"type" : "object", "enabled": false},
"coordinate": {"type" : "object", "enabled": false},
"user_mentions": {"type" : "object", "enabled": false},
"user.id": {"type" : "object", "enabled": false},
"user.name": {"type" : "object", "enabled": false},
"user.screen_name": {"type" : "object", "enabled": false},
"user.lang": {"type" : "object", "enabled": false},
"user.location": {"type" : "object", "enabled": false},
"user.profile_image_url": {"type" : "object", "enabled": false},
"user.create_at" : {"type": "date", "format": "strict_date_time"},
"user.description": {"type" : "object", "enabled": false},
"user.followers_count": {"type": "object", "enabled": false},
"user.friends_count": {"type": "object", "enabled": false},
"user.statues_count": {"type": "object", "enabled": false},
"place.country": {"type": "object", "enabled": false},
"place.country_code": {"type": "object", "enabled": false},
"place.bounding_box": {"type" : "object", "enabled": false},
"place.full_name": {"type": "object", "enabled": false},
"place.id": {"type": "object", "enabled": false},
"place.name": {"type": "object", "enabled": false},
"place.place_type": {"type": "object", "enabled": false},
"geo_tag.stateName": {"type" : "object", "enabled": false},
"geo_tag.countyName": {"type" : "object", "enabled": false},
"geo_tag.cityName": {"type" : "object", "enabled": false},
"geo_tag.stateID": {"type": "long"},
"geo_tag.countyID": {"type": "long"},
"geo_tag.cityID": {"type": "long"}
}
}
},
"settings": {
"index": {
"max_result_window": 2147483647,
"number_of_replicas": 0,
"number_of_shards": 4,
"refresh_interval": "10s"
}
}
}
'
- Unzip
sample.json.zip
under cloudberry/examples/twittermap/script/
unzip sample.json.zip
- Create a file named
prepareDataForElastic.py
, copy the following content into it, and save it. Create the file:
vi prepareDataForElastic.py
Copy the following content
# Convert sample.json (one JSON tweet per line) into twitter.json, where each
# tweet is preceded by an Elasticsearch bulk "index" action line.
from collections import OrderedDict
import json

with open("sample.json", "r") as f, open("twitter.json", "w") as g:
    for line in f:
        # Skip blank lines (e.g. a trailing newline); json.loads would raise.
        if not line.strip():
            continue
        # OrderedDict keeps the tweet's fields in their original order.
        data = json.loads(line, object_pairs_hook=OrderedDict)
        # Action line: index this document under the tweet's own id.
        g.write("{\"index\":{\"_id\":\"" + str(data["id"]) + "\"}}\n")
        # Append a fixed -0800 offset and join the whitespace-separated
        # parts with "T" (presumably to satisfy the "strict_date_time"
        # format declared for create_at in the index mapping).
        data["create_at"] = ("T").join((data["create_at"] + "-0800").split())
        g.write(json.dumps(data) + "\n")
Save file
- Convert the
sample.json
file into a twitter.json
file by running prepareDataForElastic.py
python prepareDataForElastic.py
- Move
twitter.json
to the Elasticsearch directory (~/elasticsearch-6.7.2/), then bulk-load it into the index from that directory:
curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter.json"
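Edit the file cloudberry/cloudberry/neo/conf/application.conf: comment out the AsterixDB settings (lines 89 and 96), uncomment the Elasticsearch settings (lines 93 and 101), and set the query gaps (lines 86 and 87) as shown below.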
-
line 89:
asterixdb.url = "http://localhost:19002/query/service"
-
line 96:
asterixdb.lang = SQLPP
-
line 93:
#elasticsearch.url = "http://localhost:9200"
-
line 101:
#asterixdb.lang = elasticsearch
-
line 86:
berry.firstquery.gap = "60 days"
-
line 87:
berry.query.gap = "180 days"
Now you can start Cloudberry and Twittermap as normal!