Showing 34 changed files with 893 additions and 219 deletions.

lt.seg.hadoop/src/main/java/de/tudarmstadt/lt/seg/app/HadoopSegmenterRed.java (new file, 191 additions, 0 deletions)

@@ -0,0 +1,191 @@
package de.tudarmstadt.lt.seg.app;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.Iterator;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.mortbay.log.Log;

import de.tudarmstadt.lt.seg.sentence.ISentenceSplitter;
import de.tudarmstadt.lt.seg.token.ITokenizer;
import de.tudarmstadt.lt.utilities.cli.ExtendedGnuParser;

/**
 * Runs sentence splitting and tokenization as a Hadoop map/reduce job. Example invocation:
 *
 * hadoop jar lt.seg.hadoop-0.7.0-SNAPSHOT-jar-with-dependencies.jar de.tudarmstadt.lt.seg.app.HadoopSegmenterRed -Dmapreduce.job.queuename=shortrunning -Dmapreduce.job.reduces=10000 --keycolumn 0 --textcolumn 2 -f ${in} -o ${out}
 *
 * @author rem
 *
 */
@SuppressWarnings("static-access")
public class HadoopSegmenterRed extends Configured implements Tool {

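    // Additional CLI options for the Hadoop job. They are registered a second time in
    // SegmentationReducer's static initializer, since the reducers run in separate task
    // JVMs where this outer class may never be loaded.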
    static {
        Segmenter.opts.addOption(OptionBuilder.withLongOpt("keycolumn").withArgName("column-of-document-key").hasArg().withDescription("Specify the column that contains the document key starting from 0. Specify '-1' to use line id. (default: '-1').").create("ck"));
        Segmenter.opts.addOption(OptionBuilder.withLongOpt("textcolumn").withArgName("column-of-document-text").hasArg().withDescription("Specify the column that contains the document starting from 0. Specify '-1' to use whole line. (default: '-1').").create("ct"));
    }

    public static void main(String[] args) throws Exception {
        // Try to instantiate a segmenter, tokenizer and sentence splitter. If something
        // does not work, the application is killed right here instead of failing later
        // inside the running tasks.
        Segmenter segmenter = new Segmenter(args);
        segmenter.newSentenceSplitter();
        segmenter.newTokenizer();

        int res = ToolRunner.run(new Configuration(), new HadoopSegmenterRed(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {

        System.out.println(Arrays.toString(args));

        JobConf conf = new JobConf(getConf(), HadoopSegmenterRed.class);
        conf.setJobName(HadoopSegmenterRed.class.getSimpleName());
        // conf.setQueueName("shortrunning");

        System.out.println("queuename: " + conf.getQueueName());

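        // The map phase is a pure identity function; all segmentation work happens in
        // the reduce phase, so the configured number of reduce tasks controls both the
        // parallelism and the number of output files.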
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(SegmentationReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

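        // TextInputFormat produces (byte offset, line) pairs, which the identity mapper
        // passes through unchanged; hence the LongWritable/Text map output types.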
        conf.setMapOutputKeyClass(LongWritable.class);
        conf.setMapOutputValueClass(Text.class);

        conf.setOutputKeyClass(NullWritable.class);
        conf.setOutputValueClass(Text.class);

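        // Hand the original command line to the reduce tasks via the job configuration;
        // SegmentationReducer.configure() re-parses it there.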
        conf.setStrings("cli-args", args);

        // instantiate a segmenter to get the options right
        Segmenter segmenter = new Segmenter(args);
        FileInputFormat.setInputPaths(conf, new Path(segmenter._filename_in));
        FileOutputFormat.setOutputPath(conf, new Path(segmenter._filename_out));

        // delete output path (for testing purposes)
        // FileSystem.get(conf).delete(new Path(args[1]), true);

        JobClient.runJob(conf);
        return 0;
    }

    public static class SegmentationReducer extends MapReduceBase implements Reducer<LongWritable, Text, NullWritable, Text> {

        static {
            Segmenter.opts.addOption(OptionBuilder.withLongOpt("keycolumn").withArgName("column-of-document-key").hasArg().withDescription("Specify the column that contains the document key starting from 0. Specify '-1' to use line id. (default: '-1').").create("ck"));
            Segmenter.opts.addOption(OptionBuilder.withLongOpt("textcolumn").withArgName("column-of-document-text").hasArg().withDescription("Specify the column that contains the document starting from 0. Specify '-1' to use whole line. (default: '-1').").create("ct"));
        }

        Segmenter _segmenter;
        ISentenceSplitter _sentenceSplitter;
        ITokenizer _tokenizer;

        int _col_key = -1;
        int _col_text = -1;

        @Override
        public void configure(JobConf job) {
            String[] args = job.getStrings("cli-args");
            System.out.println(Arrays.toString(args));

            _segmenter = new Segmenter(args);
            try {
                _sentenceSplitter = _segmenter.newSentenceSplitter();
                _tokenizer = _segmenter.newTokenizer();

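                // Re-parse the command line that run() stored in the job configuration
                // to recover the key and text column indices.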
                CommandLine cmd = new ExtendedGnuParser(true).parse(Segmenter.opts, args);
                _col_key = Integer.parseInt(cmd.getOptionValue("keycolumn", "-1"));
                _col_text = Integer.parseInt(cmd.getOptionValue("textcolumn", "-1"));

            } catch (Exception e) {
                throw new RuntimeException(e);
            }

            super.configure(job);
        }

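        // Each incoming value is one raw input line; one output line is emitted per
        // segmented sentence.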
        @Override
        public void reduce(LongWritable key, Iterator<Text> values, OutputCollector<NullWritable, Text> output, Reporter reporter) throws IOException {
            reporter.progress();

            String docid = String.valueOf(key.get());

            while (values.hasNext()) {
                String line = values.next().toString();

                if (_col_text > -1 || _col_key > -1) {
                    String[] columns = line.split(Pattern.quote("\t"));
                    if (_col_key > -1) {
                        if (columns.length <= _col_key) {
                            Log.warn(String.format("Key column does not exist for line '%s': %d, columns: %d.", docid, _col_key, columns.length));
                            continue;
                        }
                        docid = columns[_col_key];
                    }
                    if (_col_text > -1) {
                        if (columns.length <= _col_text) {
                            Log.warn(String.format("Text column does not exist for key='%s': %d, columns: %d.", docid, _col_text, columns.length));
                            continue;
                        }
                        line = columns[_col_text];
                    }
                }

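                // The input may contain escaped tabs and newlines; restore them before
                // segmentation.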
                Reader reader = new StringReader(line.replace("\\t", "\t").replace("\\n", "\n"));
                StringWriter sw = new StringWriter();
                PrintWriter writer = new PrintWriter(sw);

                Segmenter.split_and_tokenize(
                        reader,
                        docid,
                        _sentenceSplitter,
                        _tokenizer,
                        _segmenter._level_filter,
                        _segmenter._level_normalize,
                        _segmenter._merge_types,
                        _segmenter._merge_tokens,
                        "\n", // _segmenter._separator_sentence
                        _segmenter._separator_token,
                        "\t", // _segmenter._separator_desc
                        writer);

                Text out_text = new Text();
                for (String sentence_line : sw.toString().split("\n")) {
                    out_text.set(sentence_line);
                    output.collect(NullWritable.get(), out_text);
                }
            }
        }

    }

}