Skip to content

Elasticsearch Adapter

QIUSHI BAI edited this page Aug 16, 2019 · 21 revisions

Elasticsearch Adapter

This page explains how to use Elasticsearch and Cloudberry to set up a small instance of TwitterMap on a local machine.

System requirements:

  • Linux or MacOS

Step 1: Download elasticsearch 6.7.* (in this example 6.7.2)

Step 1.1: Move to your home directory

cd ~

Step 1.2: Download elasticsearch

wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.7.2.tar.gz

Step 1.3: Uncompress this file

tar -xzf elasticsearch-6.7.2.tar.gz

Step 1.4: Move to elasticsearch-6.7.2/ directory

cd elasticsearch-6.7.2/

Step 2: Start elasticsearch

Step 2.1: Run elasticsearch

  • ./bin/elasticsearch

  • Or start in daemon mode: ./bin/elasticsearch -d -p pid

    • To shut down Elasticsearch in daemon mode, kill the process whose ID is stored in the pid file

      pkill -F pid

Step 3: Create Index

There is one index and one index template to create:

Step 3.1: Create twitter.ds_tweet

# Create the twitter.ds_tweet index with an explicit mapping and settings.
# - create_at and user.create_at are dates in strict_date_time format;
#   text and hashtags are full-text fields with a keyword sub-field;
#   id and the geo_tag.*ID fields are longs.
# - Every field mapped as {"type": "object", "enabled": false} is kept in
#   _source but is not parsed or indexed by Elasticsearch, so it cannot be
#   searched or aggregated on.
# - The index is sorted by create_at in descending order (sort.field /
#   sort.order), uses 4 primary shards and no replicas.
# NOTE(review): "user.statues_count" is kept exactly as spelled here;
#   presumably it has to match the field name in the tweet data - confirm
#   before "correcting" it.
curl -X PUT "localhost:9200/twitter.ds_tweet" -H 'Content-Type: application/json' -d'
{
    "mappings" : {
        "_doc" : {
            "properties" : {    
                "create_at" : {"type": "date", "format": "strict_date_time"},
                "text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
                "id": {"type" : "long"},
                "hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
                "in_reply_to_status": {"type" : "object", "enabled": false},
                "in_reply_to_user": {"type" : "object", "enabled": false},
                "favorite_count": {"type" : "object", "enabled": false},
                "lang": {"type" : "object", "enabled": false},
                "is_retweet": {"type" : "object", "enabled": false},
                "coordinate": {"type" : "object", "enabled": false},
                "user_mentions": {"type" : "object", "enabled": false},
                "user.id": {"type" : "object", "enabled": false},
                "user.name": {"type" : "object", "enabled": false},
                "user.screen_name": {"type" : "object", "enabled": false},
                "user.lang": {"type" : "object", "enabled": false},
                "user.location": {"type" : "object", "enabled": false},
                "user.profile_image_url": {"type" : "object", "enabled": false},
                "user.create_at" : {"type": "date", "format": "strict_date_time"},
                "user.description": {"type" : "object", "enabled": false},
                "user.followers_count": {"type": "object", "enabled": false},
                "user.friends_count": {"type": "object", "enabled": false},
                "user.statues_count": {"type": "object", "enabled": false},
                "place.country": {"type": "object", "enabled": false},
                "place.country_code": {"type": "object", "enabled": false},
                "place.bounding_box": {"type" : "object", "enabled": false},
                "place.full_name": {"type": "object", "enabled": false},
                "place.id": {"type": "object", "enabled": false},
                "place.name": {"type": "object", "enabled": false},
                "place.place_type": {"type": "object", "enabled": false},
                "geo_tag.stateName": {"type" : "object", "enabled": false},
                "geo_tag.countyName": {"type" : "object", "enabled": false},
                "geo_tag.cityName": {"type" : "object", "enabled": false},
                "geo_tag.stateID": {"type": "long"},
                "geo_tag.countyID": {"type": "long"},
                "geo_tag.cityID": {"type": "long"}
            }
        }
    },
    "settings": {
        "index": {
	        "max_result_window": 2147483647,
	        "number_of_replicas": 0,
	        "number_of_shards": 4,
	        "sort.field": "create_at",
	        "sort.order": "desc"
        }
    }
}
'

Step 3.2: Create template for view table

# Register an index template named "twitter". Elasticsearch applies its
# mappings and settings to every newly created index whose name matches the
# pattern twitter.ds_tweet_* (presumably the view indices Cloudberry creates
# on top of twitter.ds_tweet - confirm against Cloudberry).
# - The field mappings are the same as those of the twitter.ds_tweet index:
#   fields mapped as {"type": "object", "enabled": false} are kept in
#   _source but are not parsed or indexed.
# - Unlike twitter.ds_tweet, no index sorting is configured; instead the
#   refresh interval is set to 10 seconds.
curl -X PUT "localhost:9200/_template/twitter" -H 'Content-Type: application/json' -d'
{
    "index_patterns": ["twitter.ds_tweet_*"],
    "mappings" : {
        "_doc" : {
            "properties" : {    
                "create_at" : {"type": "date", "format": "strict_date_time"},
                "text": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
                "id": {"type" : "long"},
                "hashtags": {"type": "text", "fields": {"keyword": {"type": "keyword","ignore_above": 256}}},
                "in_reply_to_status": {"type" : "object", "enabled": false},
                "in_reply_to_user": {"type" : "object", "enabled": false},
                "favorite_count": {"type" : "object", "enabled": false},
                "lang": {"type" : "object", "enabled": false},
                "is_retweet": {"type" : "object", "enabled": false},
                "coordinate": {"type" : "object", "enabled": false},
                "user_mentions": {"type" : "object", "enabled": false},
                "user.id": {"type" : "object", "enabled": false},
                "user.name": {"type" : "object", "enabled": false},
                "user.screen_name": {"type" : "object", "enabled": false},
                "user.lang": {"type" : "object", "enabled": false},
                "user.location": {"type" : "object", "enabled": false},
                "user.profile_image_url": {"type" : "object", "enabled": false},
                "user.create_at" : {"type": "date", "format": "strict_date_time"},
                "user.description": {"type" : "object", "enabled": false},
                "user.followers_count": {"type": "object", "enabled": false},
                "user.friends_count": {"type": "object", "enabled": false},
                "user.statues_count": {"type": "object", "enabled": false},
                "place.country": {"type": "object", "enabled": false},
                "place.country_code": {"type": "object", "enabled": false},
                "place.bounding_box": {"type" : "object", "enabled": false},
                "place.full_name": {"type": "object", "enabled": false},
                "place.id": {"type": "object", "enabled": false},
                "place.name": {"type": "object", "enabled": false},
                "place.place_type": {"type": "object", "enabled": false},
                "geo_tag.stateName": {"type" : "object", "enabled": false},
                "geo_tag.countyName": {"type" : "object", "enabled": false},
                "geo_tag.cityName": {"type" : "object", "enabled": false},
                "geo_tag.stateID": {"type": "long"},
                "geo_tag.countyID": {"type": "long"},
                "geo_tag.cityID": {"type": "long"}
            }
        }
    },
    "settings": {
        "index": {
	        "max_result_window": 2147483647,
	        "number_of_replicas": 0,
	        "number_of_shards": 4,
	        "refresh_interval": "10s"
        }
    }
}
'

Step 4: Insert data

Step 4.1: Prepare data

  • Download the synthetic sample tweet data (about 100K tweets) into the cloudberry/examples/twittermap/script/ directory:

(Note: this file is sample.json.zip, different from the sample.adm.gz file in Quick Start tutorial)

wget http://cloudberry.ics.uci.edu/img/sample.json.zip
  • Unzip sample.json.zip under cloudberry/examples/twittermap/script/
unzip sample.json.zip
  • Create prepareDataForElastic.py file

  • copy the following content into it

"""Convert sample.json into Elasticsearch bulk-API files.

Reads sample.json (one JSON tweet per line) from the current directory and
writes twitter-0.json, twitter-1.json, ... next to it. Each output file holds
at most BATCH_SIZE tweets, and every tweet becomes two lines: a bulk "index"
action carrying the tweet id as the document _id, followed by the tweet itself
with its timestamps rewritten into the strict_date_time format.

Runs unchanged on Python 2 and Python 3.
"""
from collections import OrderedDict
import io
import json

# Number of tweets written to each twitter-<i>.json bulk file.
BATCH_SIZE = 40000


def to_bulk_lines(line):
    """Return the (action_line, document_line) pair for one line of sample.json.

    Both returned strings end with a newline, as the bulk API requires.
    """
    data = json.loads(line, object_pairs_hook=OrderedDict)
    # Serialise the action with json.dumps so the id is always escaped
    # correctly; compact separators keep the output identical to
    # {"index":{"_id":"<id>"}}.
    action = json.dumps({"index": {"_id": str(data["id"])}},
                        separators=(",", ":"))
    # Drop the last character of create_at, append a fixed -0800 offset and
    # replace the whitespace between date and time with "T".
    data["create_at"] = "T".join((data["create_at"][:-1] + "-0800").split())
    # user.create_at gets a midnight time with the same offset appended.
    data["user"]["create_at"] = data["user"]["create_at"] + "T00:00:00.000-0800"
    return action + "\n", json.dumps(data) + "\n"


def write_batch(index, lines):
    """Write the given bulk lines to twitter-<index>.json."""
    # Binary mode is required: the payload is encoded to UTF-8 bytes, which a
    # text-mode file object rejects on Python 3.
    with open("twitter-" + str(index) + ".json", "wb") as out:
        out.write("".join(lines).encode("utf-8"))


def convert(source="sample.json", batch_size=BATCH_SIZE):
    """Split `source` into bulk files of `batch_size` tweets each.

    Returns the number of twitter-<i>.json files written.
    """
    batch = []
    pending = 0
    file_index = 0
    # io.open gives explicit UTF-8 decoding on both Python 2 and Python 3
    # instead of depending on the locale's default encoding.
    with io.open(source, "r", encoding="utf-8") as src:
        for line in src:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            batch.extend(to_bulk_lines(line))
            pending += 1
            if pending == batch_size:
                write_batch(file_index, batch)
                batch = []
                pending = 0
                file_index += 1
    if pending > 0:
        # Flush the final, partially filled batch.
        write_batch(file_index, batch)
        file_index += 1
    return file_index


if __name__ == "__main__":
    convert()
  • Save prepareDataForElastic.py file

  • Convert the sample.json file into twitter-i.json files by running prepareDataForElastic.py

python prepareDataForElastic.py

Step 4.2: Ingest data into ElasticSearch

After Step 4.1, there will be 3 files (twitter-0.json, twitter-1.json and twitter-2.json) under the twittermap/script directory. Run the following 3 commands to ingest them into Elasticsearch.

# Send each prepared file to the bulk API of the twitter.ds_tweet index.
# --data-binary posts the file as-is, keeping the newlines that separate the
# bulk action/document lines; "refresh" makes the documents searchable as
# soon as the request returns.
# -o /dev/null discards the response body, so per-document errors reported
# by the bulk API are not shown - drop that option when debugging.
curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter-0.json"

curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter-1.json"

curl -o /dev/null -H "Content-Type: application/json" -XPOST "localhost:9200/twitter.ds_tweet/_doc/_bulk?pretty&refresh" --data-binary "@twitter-2.json"

Step 5: Configure Cloudberry

Edit file: cloudberry/cloudberry/neo/conf/application.conf

Step 5.1: Comment out lines 89 and 96, which are the AsterixDB configurations
  • line 89: asterixdb.url = "http://localhost:19002/query/service"

  • line 96: asterixdb.lang = SQLPP

Step 5.2: Uncomment lines 93 and 101, which are the Elasticsearch configurations
  • line 93: #elasticsearch.url = "http://localhost:9200"

  • line 101: #asterixdb.lang = elasticsearch

Step 5.3: Update lines 86 and 87 to tune the DRUM parameters so that they are friendlier to Elasticsearch
  • line 86: berry.firstquery.gap = "60 days"

  • line 87: berry.query.gap = "180 days"

Now you can start Cloudberry and Twittermap as normal!