diff --git a/.gitignore b/.gitignore
index a6cfe9a4d..d415250f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
*.ipr
*.iws
*.tmproj
+*.swp
*~
.DS_Store
.classpath
diff --git a/cascading-protobuf/pom.xml b/cascading-protobuf/pom.xml
new file mode 100644
index 000000000..d88fd1c86
--- /dev/null
+++ b/cascading-protobuf/pom.xml
@@ -0,0 +1,37 @@
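+<?xml version="1.0"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>com.twitter.elephantbird</groupId>
+    <artifactId>elephant-bird</artifactId>
+    <version>4.13-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+  <artifactId>elephant-bird-cascading-protobuf</artifactId>
+  <name>Elephant Bird Cascading Protobuf</name>
+  <description>Cascading Protobuf utilities.</description>
+  <repositories>
+    <repository>
+      <id>conjars.org</id>
+      <url>http://conjars.org/repo</url>
+    </repository>
+  </repositories>
+  <dependencies>
+    <dependency>
+      <groupId>com.google.protobuf</groupId>
+      <artifactId>protobuf-java</artifactId>
+      <version>${protobuf.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>cascading</groupId>
+      <artifactId>cascading-hadoop</artifactId>
+
+      <version>${cascading3.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+    </dependency>
+  </dependencies>
+</project>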
diff --git a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufComparator.java b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufComparator.java
similarity index 97%
rename from cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufComparator.java
rename to cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufComparator.java
index e21fd3766..9fbe086dd 100644
--- a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufComparator.java
+++ b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufComparator.java
@@ -1,4 +1,4 @@
-package com.twitter.elephantbird.cascading2.io.protobuf;
+package com.twitter.elephantbird.cascading.protobuf;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
diff --git a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufDeserializer.java b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufDeserializer.java
similarity index 93%
rename from cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufDeserializer.java
rename to cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufDeserializer.java
index 713d46e28..d9a4599e6 100644
--- a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufDeserializer.java
+++ b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufDeserializer.java
@@ -1,4 +1,4 @@
-package com.twitter.elephantbird.cascading2.io.protobuf;
+package com.twitter.elephantbird.cascading.protobuf;
import java.io.IOException;
import java.io.InputStream;
diff --git a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufReflectionUtil.java b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufReflectionUtil.java
similarity index 96%
rename from cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufReflectionUtil.java
rename to cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufReflectionUtil.java
index dd4f6ea10..ee8847f82 100644
--- a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufReflectionUtil.java
+++ b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufReflectionUtil.java
@@ -1,4 +1,4 @@
-package com.twitter.elephantbird.cascading2.io.protobuf;
+package com.twitter.elephantbird.cascading.protobuf;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
diff --git a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufSerialization.java b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerialization.java
similarity index 94%
rename from cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufSerialization.java
rename to cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerialization.java
index 7da6d8f87..b5ad0c461 100644
--- a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufSerialization.java
+++ b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerialization.java
@@ -1,4 +1,4 @@
-package com.twitter.elephantbird.cascading2.io.protobuf;
+package com.twitter.elephantbird.cascading.protobuf;
import java.util.Comparator;
diff --git a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufSerializer.java b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerializer.java
similarity index 91%
rename from cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufSerializer.java
rename to cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerializer.java
index 835261085..14b898950 100644
--- a/cascading2/src/main/java/com/twitter/elephantbird/cascading2/io/protobuf/ProtobufSerializer.java
+++ b/cascading-protobuf/src/main/java/com/twitter/elephantbird/cascading/protobuf/ProtobufSerializer.java
@@ -1,4 +1,4 @@
-package com.twitter.elephantbird.cascading2.io.protobuf;
+package com.twitter.elephantbird.cascading.protobuf;
import java.io.IOException;
import java.io.OutputStream;
diff --git a/cascading2/pom.xml b/cascading2/pom.xml
index 481ffbf8e..2eafe5620 100644
--- a/cascading2/pom.xml
+++ b/cascading2/pom.xml
@@ -32,6 +32,8 @@
cascading
cascading-hadoop
+ ${cascading2.version}
+ provided
junit
diff --git a/cascading3/pom.xml b/cascading3/pom.xml
new file mode 100644
index 000000000..b28cb1640
--- /dev/null
+++ b/cascading3/pom.xml
@@ -0,0 +1,43 @@
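+<?xml version="1.0"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>com.twitter.elephantbird</groupId>
+    <artifactId>elephant-bird</artifactId>
+    <version>4.13-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+  <artifactId>elephant-bird-cascading3</artifactId>
+  <name>Elephant Bird Cascading3</name>
+  <description>Cascading utilities.</description>
+  <repositories>
+    <repository>
+      <id>conjars.org</id>
+      <url>http://conjars.org/repo</url>
+    </repository>
+  </repositories>
+  <dependencies>
+    <dependency>
+      <groupId>com.twitter.elephantbird</groupId>
+      <artifactId>elephant-bird-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>cascading</groupId>
+      <artifactId>cascading-hadoop</artifactId>
+      <version>${cascading3.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+    </dependency>
+  </dependencies>
+</project>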
diff --git a/cascading3/src/main/java/com/twitter/elephantbird/cascading3/scheme/CombinedSequenceFile.java b/cascading3/src/main/java/com/twitter/elephantbird/cascading3/scheme/CombinedSequenceFile.java
new file mode 100644
index 000000000..f056131fc
--- /dev/null
+++ b/cascading3/src/main/java/com/twitter/elephantbird/cascading3/scheme/CombinedSequenceFile.java
@@ -0,0 +1,85 @@
+package com.twitter.elephantbird.cascading3.scheme;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+
+import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper;
+import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat;
+
+import cascading.flow.FlowProcess;
+import cascading.scheme.hadoop.SequenceFile;
+import cascading.tap.Tap;
+import cascading.tuple.Fields;
+
+/**
+ * This scheme allows SequenceFile splits to be combined via the DelegateCombineFileInputFormat
+ * before they are read. It can be used to combine inputs for intermediate MR jobs in Cascading.
+ *
+ * To enable, set cascading.flowconnector.intermediateschemeclass to this class in the Hadoop
+ * configuration.
+ *
+ * @author Akihiro Matsukawa
+ */
+public class CombinedSequenceFile extends SequenceFile {
+
+  // Each MR_* key is a Hadoop output-compression setting; its public sibling is the scheme-local override.
+  private static final String MR_COMPRESS_ENABLE = "mapreduce.output.fileoutputformat.compress";
+  public static final String COMPRESS_ENABLE = "elephantbird.cascading.combinedsequencefile.compress.enable";
+
+  private static final String MR_COMPRESS_TYPE = "mapreduce.output.fileoutputformat.compress.type";
+  public static final String COMPRESS_TYPE = "elephantbird.cascading.combinedsequencefile.compress.type";
+
+  private static final String MR_COMPRESS_CODEC = "mapreduce.output.fileoutputformat.compress.codec";
+  public static final String COMPRESS_CODEC = "elephantbird.cascading.combinedsequencefile.compress.codec";
+
+  protected CombinedSequenceFile() { super(); }
+
+  public CombinedSequenceFile(Fields fields) { super(fields); }
+
+  /** Copies the value of {@code localKey} onto {@code hadoopKey}; does nothing when the local key is unset. */
+  private static void overrideIfSet(Configuration conf, String localKey, String hadoopKey) {
+    String localValue = conf.get(localKey);
+    if (localValue != null) {
+      conf.set(hadoopKey, localValue);
+    }
+  }
+
+  /** Allows the compression settings to be overridden for just this scheme. */
+  private void updateJobConfForLocalSettings(Configuration conf) {
+    overrideIfSet(conf, COMPRESS_ENABLE, MR_COMPRESS_ENABLE);
+    overrideIfSet(conf, COMPRESS_TYPE, MR_COMPRESS_TYPE);
+    overrideIfSet(conf, COMPRESS_CODEC, MR_COMPRESS_CODEC);
+  }
+
+  /**
+   * Runs the parent source setup, applies the scheme-local compression overrides, and
+   * registers the wrapped SequenceFileInputFormat as the delegate of the combining input format.
+   */
+  @Override
+  public void sourceConfInit(
+      FlowProcess<? extends Configuration> flowProcess,
+      Tap<Configuration, RecordReader, OutputCollector> tap,
+      Configuration conf) {
+    super.sourceConfInit(flowProcess, tap, conf);
+
+    updateJobConfForLocalSettings(conf);
+
+    // both EB combiner and Cascading3 work over the mapreduce API
+    // however, SequenceFileInputFormat is in the mapred API.
+    // in order to use the EB combiner we must wrap the mapred SequenceFileInputFormat
+    // with the MapReduceInputFormatWrapper and then wrap it in the DelegateCombineFileInputFormat
+    MapReduceInputFormatWrapper.setWrappedInputFormat(SequenceFileInputFormat.class, conf);
+    DelegateCombineFileInputFormat.setDelegateInputFormatHadoop2(conf, MapReduceInputFormatWrapper.class);
+  }
+
+  /** Runs the parent sink setup, then applies the scheme-local compression overrides. */
+  @Override
+  public void sinkConfInit(FlowProcess<? extends Configuration> flowProcess,
+      Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
+    super.sinkConfInit(flowProcess, tap, conf);
+    updateJobConfForLocalSettings(conf);
+  }
+}
diff --git a/cascading3/src/main/java/com/twitter/elephantbird/cascading3/scheme/LzoBinaryScheme.java b/cascading3/src/main/java/com/twitter/elephantbird/cascading3/scheme/LzoBinaryScheme.java
new file mode 100644
index 000000000..6cf28c91b
--- /dev/null
+++ b/cascading3/src/main/java/com/twitter/elephantbird/cascading3/scheme/LzoBinaryScheme.java
@@ -0,0 +1,85 @@
+package com.twitter.elephantbird.cascading3.scheme;
+
+import java.io.IOException;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.RecordReader;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.twitter.elephantbird.mapreduce.io.BinaryWritable;
+
+import cascading.flow.FlowProcess;
+import cascading.scheme.Scheme;
+import cascading.scheme.SinkCall;
+import cascading.scheme.SourceCall;
+import cascading.tap.Tap;
+import cascading.tuple.Tuple;
+import cascading.tuple.TupleEntry;
+
+/**
+ * Scheme for lzo binary encoded files. Handles both block and base 64. Can be used for Protobuf and Thrift.
+ *
+ * @author Argyris Zymnis
+ */
+abstract public class LzoBinaryScheme> extends
+ Scheme {
+
+ private static final Logger LOG = LoggerFactory.getLogger(LzoBinaryScheme.class);
+ private static final long serialVersionUID = -5011096855302946106L;
+
+ @Override
+ public void sink(FlowProcess<? extends Configuration> flowProcess,
+     SinkCall<T, OutputCollector> sinkCall) throws IOException {
+   OutputCollector collector = sinkCall.getOutput();
+   TupleEntry entry = sinkCall.getOutgoingEntry();
+   T writable = sinkCall.getContext();  // sink context; sinkPrepare stores it via setContext
+   writable.set((M) entry.getTuple().getObject(0));  // only field 0 of the outgoing tuple is written
+   collector.collect(null, writable);  // emitted with a null key
+ }
+
+ @Override
+ public void sinkPrepare(FlowProcess<? extends Configuration> fp, SinkCall<T, OutputCollector> sinkCall) {
+   sinkCall.setContext(prepareBinaryWritable());  // sink() reads this writable back via getContext()
+ }
+
+ protected abstract T prepareBinaryWritable();
+
+ @Override
+ public boolean source(FlowProcess extends Configuration> flowProcess,
+ SourceCall
+
+ ${project.groupId}
+ elephant-bird-cascading3
+ ${project.version}
+
+
+ ${project.groupId}
+ elephant-bird-cascading-protobuf
+ ${project.version}
+
${project.groupId}
elephant-bird-core
@@ -665,6 +677,8 @@
cascading2
+ cascading3
+ cascading-protobuf
crunch
core
hadoop-compat