Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data Tools with Twitter Ingestion Server, Twitter GeoTagger and AsterixDB Ingestion Server. #807

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
1144a53
add twitter ingestion server and worker and config classes
baiqiushi May 2, 2020
3c11419
TwitterIngestionServer with admin console and proxy websocket port th…
baiqiushi May 4, 2020
5311674
add TwitterGeoTagger module
baiqiushi Jun 17, 2020
fc00812
A working version of AsterixDBIngestionDriver
baiqiushi Jun 21, 2020
d10745b
add support for entities and extended_entities columns in AsterixDBAd…
baiqiushi Jun 22, 2020
ebc05cd
Merge branch 'master' into data-driver
baiqiushi Jun 29, 2020
97db1ed
Merge branch 'master' into data-driver
baiqiushi Dec 29, 2020
6bd3e50
Add -skip argument for TwitterGeoTagger commandline tool model which …
baiqiushi Jan 27, 2021
0538333
ignore invalid json tweets format for TwitterGeoTagger
baiqiushi Jan 28, 2021
c9d4007
[twittermap-datatools] resolve the http-client logging in debug mode …
baiqiushi Dec 24, 2021
1bdce2b
[twittermap-datatools] (1) add support for weekly and monthly ingesti…
baiqiushi Dec 29, 2021
e62b74e
[twittermap-datatools] minor fix in TwitterIngestionConfig.
baiqiushi Dec 29, 2021
eb0b16d
[twittermap-datatools] (1) fix the bug that AsterixDBIngestionDriver …
baiqiushi Dec 30, 2021
657fb1a
[twittermap-datatools] fix issues in AsterixDBAdapterForTwitterMap.
baiqiushi Dec 31, 2021
4f370be
[twittermap-datatools] fix bug in AsterixDBAdapterForTwitterMap.
baiqiushi Jan 11, 2022
a16d8d2
[twittermap-datatools] minor fix in AsterixDBAdapterForTwitterMap and…
baiqiushi Jan 16, 2022
3786af4
[twittermap-datatools] fix parseDate and formatDate multi-threading u…
baiqiushi Jan 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ server.pid

.DS_Store
.idea/
.vscode/
6 changes: 6 additions & 0 deletions examples/twittermap/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,9 @@ lazy val guardian = (project in file("guardian")).
settings(
libraryDependencies ++= guardianDependencies
)

// Data tools sub-project (sources under ./datatools): shares the common build
// settings, pulls in datatoolsDependencies, and depends on gnosis and util.
lazy val datatools = (project in file("datatools"))
  .settings(Commons.settings: _*)
  .settings(libraryDependencies ++= datatoolsDependencies)
  .dependsOn(gnosis, util)
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package edu.uci.ics.cloudberry.datatools.asterixdb;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.Map;

/**
 * AsterixDBAdapter
 *
 *  - provides APIs to transform a JSON record to AsterixDB format (currently ADM)
 *
 * The date formatters declared here are shared {@link SimpleDateFormat} instances.
 * {@code SimpleDateFormat} is not safe for concurrent use, so code that touches a
 * formatter directly should synchronize on that formatter; {@link #getDate(String)}
 * already does so for parsing.
 *
 * @author Qiushi Bai
 */
public interface AsterixDBAdapter {

    // Pattern of the `created_at` timestamps that are parsed, e.g. "Wed Oct 10 20:19:24 +0000 2018"
    SimpleDateFormat tweetDateFormat = new SimpleDateFormat("EEE MMM d HH:mm:ss z yyyy", Locale.US);
    // Date part and time part used when writing a datetime value
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
    SimpleDateFormat timeFormat = new SimpleDateFormat("HH:mm:ss.SSSZZZZ");

    String DATE = "date";
    String DATETIME = "datetime";
    String INT64 = "int64";
    String STRING = "string"; // quoted value (suitable for string)
    String VALUE = "value"; // no quoted value (suitable for int, boolean types)
    String STRING_SET = "string_set"; // list of quoted value
    String VALUE_SET = "value_set"; // list of no quoted value
    String BOUNDING_BOX = "bounding_box"; // special treatment to bounding_box column
    String OBJECT = "object"; // just use ObjectMapper to write it to string

    /**
     * Parses a timestamp written in the {@link #tweetDateFormat} pattern.
     *
     * The shared formatter is locked for the duration of the parse so that
     * concurrent callers of this method do not corrupt its internal state.
     *
     * @param dateString timestamp text, e.g. {@code "Wed Oct 10 20:19:24 +0000 2018"}
     * @return the parsed instant
     * @throws java.text.ParseException if the text does not match the pattern
     * @throws NullPointerException if {@code dateString} is null
     */
    static Date getDate(String dateString) throws java.text.ParseException {
        synchronized (tweetDateFormat) {
            return tweetDateFormat.parse(dateString);
        }
    }

    /**
     * Transforms one record given as JSON text.
     *
     * @param tweet the record as a JSON string
     * @return the transformed record as a string
     * @throws Exception if the record cannot be parsed or transformed
     */
    String transform(String tweet) throws Exception;

    /**
     * Transforms one record given as an already-parsed attribute map.
     *
     * @param tuple the record as a map from attribute name to value
     * @return the transformed record as a string
     * @throws Exception if the record cannot be transformed
     */
    String transform(Map<String, Object> tuple) throws Exception;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package edu.uci.ics.cloudberry.datatools.asterixdb;

import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.*;

/**
 * AsterixDBAdapterForGeneralTwitter
 *
 *  - Implementation of AsterixDBAdapter for general Twitter data
 *
 * TODO - This does not work for now,
 *        because the mapper.writeValueAsString() will quote `datetime` function call in the output String
 *
 * The date formatters inherited from AsterixDBAdapter are shared SimpleDateFormat
 * instances, so every use of them in this class is synchronized on the formatter.
 *
 * @author Qiushi Bai
 */
public class AsterixDBAdapterForGeneralTwitter implements AsterixDBAdapter {

    // One mapper reused for every record instead of allocating a new one per call.
    private static final ObjectMapper MAPPER = new ObjectMapper();

    public AsterixDBAdapterForGeneralTwitter() {

        // Twitter uses UTC timezone
        // NOTE: this reconfigures the formatter shared through the interface.
        synchronized (tweetDateFormat) {
            tweetDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
        }
    }

    /**
     * Parses the tweet JSON into a map and delegates to {@link #transform(Map)}.
     *
     * @param tweet one tweet as a JSON object string
     * @return the transformed tweet written back as a string
     * @throws Exception if the JSON cannot be parsed or a `created_at` value cannot be parsed
     */
    @SuppressWarnings("unchecked") // a JSON object is read as a map keyed by attribute name
    public String transform(String tweet) throws Exception {
        Map<String, Object> tuple = MAPPER.readValue(tweet, Map.class);
        return transform(tuple);
    }

    /**
     * Transforms a parsed tweet in place and serializes it.
     *
     * @param tuple the tweet as an attribute map; it is modified by this call
     * @return the transformed tweet written back as a string
     * @throws NullPointerException if {@code tuple} is null
     * @throws Exception if a `created_at` value cannot be parsed or serialization fails
     */
    public String transform(Map<String, Object> tuple) throws Exception {
        Objects.requireNonNull(tuple, "tuple");

        /**
         * (1) Make sure "text" is always Non-truncated.
         *     - if twitter is 'truncated',
         *       use the 'extended_tweet'->'full_text' to replace 'text'
         * */
        // Boolean.TRUE.equals treats a missing, null or non-boolean flag as "not truncated"
        // instead of failing on the cast/unboxing.
        if (Boolean.TRUE.equals(tuple.get("truncated"))) {
            Object extended = tuple.get("extended_tweet");
            if (extended instanceof Map) {
                Map<?, ?> extendedTweet = (Map<?, ?>) extended;
                if (extendedTweet.containsKey("full_text")) {
                    tuple.put("text", extendedTweet.get("full_text"));
                }
            }
        }

        /**
         * (2) Transform all 'created_at' attributes to be 'datetime' recursively
         * */
        transformCreatedAt(tuple);

        // write back to String
        return MAPPER.writeValueAsString(tuple);
    }

    /**
     * Rewrites every string-valued `created_at` attribute (key compared
     * case-insensitively) of the given object, and of all nested objects, into a
     * {@code datetime("...")} call string. Values inside arrays are not visited.
     *
     * @param object the attribute map to rewrite in place; null is ignored
     * @throws Exception if a `created_at` string does not match tweetDateFormat
     */
    @SuppressWarnings("unchecked") // nested JSON objects are maps keyed by attribute name
    public void transformCreatedAt(Map<String, Object> object) throws Exception {
        if (object == null) {
            return;
        }

        // traverse all attributes of object
        for (Map.Entry<String, Object> entry : object.entrySet()) {
            Object value = entry.getValue();
            if (value instanceof Map) {
                // attribute is an object: recursive call
                transformCreatedAt((Map<String, Object>) value);
            } else if (value instanceof String && "created_at".equalsIgnoreCase(entry.getKey())) {
                // flat attribute called 'created_at'; null or non-string values are left untouched
                entry.setValue(toDatetimeCall((String) value));
            }
        }
    }

    /**
     * Parses a tweet timestamp and renders it as {@code datetime("<date>T<time>")},
     * where the date and time parts come from dateFormat and timeFormat.
     */
    private static String toDatetimeCall(String createdAt) throws Exception {
        Date date;
        synchronized (tweetDateFormat) {
            date = tweetDateFormat.parse(createdAt);
        }
        String datePart;
        synchronized (dateFormat) {
            datePart = dateFormat.format(date);
        }
        String timePart;
        synchronized (timeFormat) {
            timePart = timeFormat.format(date);
        }
        return "datetime(\"" + datePart + "T" + timePart + "\")";
    }
}
Loading