updated to new segmenter version

tudarmstadt-lt · Mar 14, 2017 · 6b25bd1 · 6b25bd1
1 parent f8ca9a3
commit 6b25bd1
Show file tree

Hide file tree

Showing 36 changed files with 641 additions and 192 deletions.
diff --git a/install.sh b/install.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-VERSION=0.7.0
+VERSION=0.7.1
 DIR="topicrawler-${VERSION}"
 
 mkdir -p ${DIR}
@@ -27,4 +27,3 @@ tar -xvzf lt.ltbot-*.tar.gz --strip-components 1 -C heritrix-3.2.0
 
 
 
-
diff --git a/lt.lm/pom.xml b/lt.lm/pom.xml
@@ -5,7 +5,7 @@
 	<parent>
 	  <groupId>de.tudarmstadt</groupId>
 	  <artifactId>lt.kd-suite</artifactId>
-	  <version>0.7.0</version>
+	  <version>0.7.1</version>
 	</parent>
 
 	<artifactId>lt.lm</artifactId>

diff --git a/lt.lm/src/main/sh/lm b/lt.lm/src/main/sh/lm
@@ -51,11 +51,14 @@ lib_dir=${lmhome}/lib; while IFS= read -r -d '' f; do cp=${cp}:"${f}"; done < <(
 cp=${cp:1} # remove heading colon
 
 # skip all -D.. and -X.. parameters before the actual main class and add them later to JAVA_OPTS
-DX=''
+DX=""
 while [[ $1 == -D* || $1 == -X* ]]; do
-	DX="$DX $1"
+	DX="$DX "$(printf '%q' "${1}") # quote args
 	shift
 done
+JAVA_OPTS="$JAVA_OPTS $DX"
+args=""
+for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args
 
 mainclass=${1:-MainFinder} # set main class if no further arg was provided
 if [ ! -z "$1" ]; then # if at least one arg was passed

diff --git a/lt.lm/src/main/sh/lm-nightly b/lt.lm/src/main/sh/lm-nightly
@@ -11,4 +11,9 @@ lmsrc="${kdhome}/lt.lm"
 
 tgt=$(find "${lmsrc}/target" -type f -name "lm" | grep "dist/" | head -n1)
 
-eval "${tgt} $@"
+args=""
+for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args
+
+cmd="${tgt} "${args}
+
+eval "${cmd}"
diff --git a/lt.ltbot/pom.xml b/lt.ltbot/pom.xml
@@ -5,7 +5,7 @@
 	<parent>
 	  <groupId>de.tudarmstadt</groupId>
 	  <artifactId>lt.kd-suite</artifactId>
-	  <version>0.7.0</version>
+	  <version>0.7.1</version>
 	</parent>
 
 	<artifactId>lt.ltbot</artifactId>

diff --git a/lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/writer/SentenceMakerJava8.java b/lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/writer/SentenceMakerJava8.java
@@ -59,6 +59,7 @@ public String getTargetLanguageCode(){
 	}
 	public void setTargetLanguageCode(String target_language_code){
 		_target_language_code = target_language_code;
+		_rule_splitter.get().initParam(_target_language_code, false);
 	}
 
 	public Stream<String> getSentencesStream(String text){
@@ -67,7 +68,7 @@ public Stream<String> getSentencesStream(String text){
 
 	public Stream<String> getSentencesStream(String text, String languagecode){
 		return _line_splitter.get().init(new StringReader(text)).stream().filter(s -> s.type == SegmentType.SENTENCE).map(Segment::asString).flatMap(line -> {
-			return _rule_splitter.get().init(new StringReader(line), languagecode).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
+			return _rule_splitter.get().init(new StringReader(line)).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
 				final AtomicInteger c = new AtomicInteger();
 				String r = _tokenizer.get().init(s.asString()).stream().sequential().map(t -> {
 					if(t.isWord())

diff --git a/lt.ltbot/src/test/scripts/prepare_eval.sh b/lt.ltbot/src/test/scripts/prepare_eval.sh
@@ -59,4 +59,27 @@ for t in ai-en vehicles-en plants-en; do for m in nf 1 2 3 5; do d=$b/$t-$m; cat
 find . -name docperps.tsv | while read f; do d=$(dirname $f); cat $f | perl -F\\t -lanE 'if($F[1] =~ /.*[0-9].*/ && $F[1]<1e4){print $_}' > $d/docperps-pr1e4.tsv &  done
 
 
+dirs="ai-en-5 plants-en-5 vehicles-en-5"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done
+
+### steffen@farnsworth:/mnt/farnsworthshare/semeval-2015-task17-texeval/wiki$ zcat wikipedia.txt.gz | wc
+### 109136195 1868200724 11792896391
+### wikipedia 11.8GB non-unique
+
+dirs="ai-en-nf vehicles-en-nf plants-en-nf"
+for d in $dirs; do jobs=$(ls -tr $d | grep job); for j in $jobs; do files=$(ls -tr $d/$j/sentences/ | grep "HTML.*\.txt\.gz"); for f in $files; do zcat $d/$j/sentences/$f | cut -f2,5 | gzip -c >> $d/crawl-sentences.txt.gz ; done ; done & done
+
+# check size
+fun () { zcat $1 | wc > $1.wc; }; for d in $dirs; do fun $d/crawl-sentences.txt.gz & done
+
+dirs="ai-en-nf plants-en-nf vehicles-en-nf"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done
+
+# combine function with find command
+fun () { echo $1; ls -lah $1; }; find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d; done
+
+# sync to farnsworth 	
+fun () { rsync -avvzhP $1/crawl-sentences* fw:data/semeval-2015-task17-texeval/$1/ ;  }
+find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d & done
+
+
+
 
diff --git a/lt.seg/README.MD b/lt.seg/README.MD
@@ -66,16 +66,18 @@ lt.seg comes with a number of parameters, run `seg -?` to get a list of options
 * `--normalize <level>` (`-nl`)
     * `0` (default): no normalization, each segment will be printed as it is in the input
     * `1`: reduce same consecutive non-word characters, e.g. multiple consecutive blanks will be merged to one. Example: "\t\t\n\t\t" -> "\t\n\t"
-    * `2`: `1` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
-    * `3`: `2` + replace all non-word segments with its symbol.
-    * `4`: `3` + lowercase words.
+    * `2`: `1` + replace empty-space and punctuation segments with their symbol
+    * `3`: `2` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
+    * `4`: `3` + replace all non-word segments with its symbol.
+    * `5`: `4` + lowercase words.
 * `--filter <level>` (`-fl`): *Note: examples below use normalization level (-nl)* `2` and DiffTokenizer
     * `0`: no filtering, each segment will be printed separated by blanks (this also includes emptyspace segments, in most cases you probably want to use at least `1` or `2`)
 	* `1`: filter control character segments
     * `2`: (default): `1` + filter emptyspace segments
     * `3`: `2` + filter unclassified and non-readable segments (attention: results heavily depend on tokenizer)
-    * `4`: `3` + filter punctuation characters. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0"
-    * `5`: `4` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
+    * `4`: `3` + filter punctuation characters
+    * `5`: `4` + filter meta data like URLs, file descriptors, emails, wiki markup, emoticons, etc.
+    * `6`: `4` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
 * `--merge [<level>]` (`-ml`): *Note: examples below use normalization level (-nl)* `2`
 	* `0`: no merging (default when not specified)
 	* `1`: merge same consecutive token types if they are not words or words with numbers (default when just -ml specified). Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0 . "

diff --git a/lt.seg/pom.xml b/lt.seg/pom.xml
@@ -5,7 +5,7 @@
 	<parent>
 	  <groupId>de.tudarmstadt</groupId>
 	  <artifactId>lt.kd-suite</artifactId>
-	  <version>0.7.0</version>
+	  <version>0.7.1</version>
 	</parent>
 
 	<artifactId>lt.seg</artifactId>

diff --git a/lt.seg/src/main/java/de/tudarmstadt/lt/seg/Segment.java b/lt.seg/src/main/java/de/tudarmstadt/lt/seg/Segment.java
@@ -15,6 +15,7 @@
  */
 package de.tudarmstadt.lt.seg;
 
+import java.util.EnumSet;
 
 /**
  * 
@@ -40,6 +41,11 @@ public class Segment {
 	public boolean hasZeroLength(){
 		return begin == end;
 	}
+
+	public int length(){
+		assert(text.length() == end - begin);
+		return end - begin;
+	}
 
 	public String asString(){
 		return text.toString();
@@ -49,7 +55,7 @@ public String asNormalizedString(int level){
 
 		String result = text.toString();
 
-		if(level >= 1 && type == SegmentType.NON_WORD){ // reduce non-word characters
+		if(level >= 1 && (type == SegmentType.NON_WORD || type == SegmentType.UNKNOWN)){ // reduce non-word characters
 			StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
 				if(x.length() == 0 || x.codePointBefore(x.length()) != y)
 					return x.appendCodePoint(y);
@@ -59,8 +65,17 @@ public String asNormalizedString(int level){
 			});
 			result = b.toString();
 		}
+
+		if(level >= 1 && type == SegmentType.CONTROL){ // replace control segments with their type symbol
+			result = type.symbol();
+		}
+
+		if(level >= 2 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCT)){
+			// replace punctuation and empty-space segments with a single symbol, no matter how long the segment once was
+			result = type.symbol();
+		}
 
-		if(level >= 2){
+		if(level >= 3){
 			if(type == SegmentType.WORD_WITH_NUMBER){ // replace consecutive digits within a word
 				StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
 					if(x.length() == 0){
@@ -81,16 +96,16 @@ public String asNormalizedString(int level){
 				});
 				result = b.toString();
 			}
-			if(type == SegmentType.NUMBER)
+			if(EnumSet.of(SegmentType.NUMBER, SegmentType.DATE, SegmentType.PHONE, SegmentType.TIME).contains(type))
 				result = type.symbol();
 		}
 
-		if(level >= 3 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCTUATION)){
-			// replace numbers, punctuation and empty spaces with a single symbol, no matter how long the number once was
-			result = type.symbol();
+		if(level >= 4){
+			if(EnumSet.complementOf(EnumSet.of(SegmentType.WORD, SegmentType.WORD_LOWERCASE, SegmentType.WORD_UPPERCASE, SegmentType.WORD_WITH_NUMBER, SegmentType.SENTENCE, SegmentType.ABBRV, SegmentType.PARAGRAPH, SegmentType.TEXT)).contains(type))
+				result = type.symbol();
 		}
 
-		if(level >= 4)
+		if(level >= 5)
 			result = result.toLowerCase();
 
 		return result;
@@ -104,19 +119,31 @@ public boolean isEmpty(){
 		return type == SegmentType.EMPTY_SPACE; 
 	}
 
+	public boolean isPartOfSentence(){
+		return type == SegmentType.SENTENCE || type == SegmentType.SENTENCE_BOUNDARY; 
+	}
+
 	public boolean isWord(){
 		return type == SegmentType.WORD ||   
 				type == SegmentType.WORD_UPPERCASE ||
 				type == SegmentType.WORD_LOWERCASE;
 	}
 
+	// TODO: replace with a type.ordinal() range check [x,y]; needs reordering of the SegmentType constants
 	public boolean isReadable(){
-		return type == SegmentType.WORD || 
-				type == SegmentType.NUMBER || 
-				type == SegmentType.WORD_WITH_NUMBER|| 
-				type == SegmentType.WORD_UPPERCASE ||
-				type == SegmentType.WORD_LOWERCASE ||
-				type == SegmentType.PUNCTUATION;  
+//		SegmentType.WORD,
+//		SegmentType.NUMBER,
+//		SegmentType.WORD_WITH_NUMBER, 
+//		SegmentType.WORD_UPPERCASE,
+//		SegmentType.WORD_LOWERCASE,
+//		SegmentType.PUNCT
+		return EnumSet.complementOf(EnumSet.of(
+					SegmentType.CONTROL,
+					SegmentType.UNKNOWN,
+					SegmentType.EMPTY_SPACE
+				)).contains(type); 
+
+
 	}
 
 	/* (non-Javadoc)

diff --git a/lt.seg/src/main/java/de/tudarmstadt/lt/seg/SegmentType.java b/lt.seg/src/main/java/de/tudarmstadt/lt/seg/SegmentType.java
@@ -28,14 +28,16 @@
  */
 public enum SegmentType {
 
-	TEXT("\u00a7", SegmentationUtils.chartypes),
 
-	CONTROL("\ufffd",
+
+	CONTROL("¶",
 			Character.CONTROL,
 			Character.FORMAT,
 			Character.SURROGATE),
 
-	PARAGRAPH("\u00b6",
+	TEXT("T", SegmentationUtils.chartypes),
+
+	PARAGRAPH("§",
 		Arrays.stream(SegmentationUtils.chartypes).filter( x -> 
 		x != Character.PARAGRAPH_SEPARATOR).toArray()),
 
@@ -46,7 +48,6 @@ public enum SegmentType {
 		x != Character.START_PUNCTUATION && 
 		x != Character.END_PUNCTUATION).toArray()),
 
-	// \u2022
 	SENTENCE_BOUNDARY(". ",
 			Character.PARAGRAPH_SEPARATOR,
 			Character.DASH_PUNCTUATION,
@@ -105,8 +106,7 @@ public enum SegmentType {
 			Character.OTHER_LETTER
 			),
 
-	// \u2235 \u00b7
-	PUNCTUATION(".",
+	PUNCT(".",
 			Character.NON_SPACING_MARK,
 			Character.ENCLOSING_MARK,
 			Character.COMBINING_SPACING_MARK,
@@ -125,15 +125,37 @@ public enum SegmentType {
 			Character.CONTROL
 			),
 
-	NON_WORD("\u20a9",
+	NON_WORD("₩",
 		Arrays.stream(SegmentationUtils.chartypes).filter(x -> 
 		x != Character.UPPERCASE_LETTER &&
 		x != Character.LOWERCASE_LETTER &&
 		x != Character.TITLECASE_LETTER && 
 		x != Character.MODIFIER_LETTER &&
 		x != Character.OTHER_LETTER).toArray()),
 
-	UNKNOWN("\u2e2e");
+	UNKNOWN("�"),
+
+	EMAIL("📧"),
+
+	DATE("📅"),
+
+	TIME("⌚"),
+
+	PHONE("☎"),
+
+	META("📓"),
+
+	EMO("☺"),  
+
+	URI("💻"),
+
+	ABBRV("㍱"),
+
+	REF("˃"),
+
+	UNSPECIFIED("⸮"),
+
+	;
 
 	public static EnumSet<SegmentType> TOKEN_TYPES = EnumSet.range(SegmentType.WORD,  SegmentType.NON_WORD);
 

diff --git a/lt.seg/src/main/java/de/tudarmstadt/lt/seg/app/Segmenter.java b/lt.seg/src/main/java/de/tudarmstadt/lt/seg/app/Segmenter.java
@@ -90,6 +90,7 @@ public Segmenter() {/* NOTHING TO DO */}
 		opts.addOption(OptionBuilder.withLongOpt("onedocperline").withDescription("Specify if you want to process documents linewise and preserve document ids, i.e. map line numbers to sentences.").create("l"));
 		opts.addOption(OptionBuilder.withLongOpt("sentence-ruleset").withArgName("languagecode").hasArg().withDescription(String.format("Specify the ruleset that you want to use together with RuleSplitter (avaliable: %s) (default: 'default')", de.tudarmstadt.lt.seg.sentence.rules.RuleSet.getAvailable())).create());
 		opts.addOption(OptionBuilder.withLongOpt("token-ruleset").withArgName("languagecode").hasArg().withDescription(String.format("Specify the ruleset that you want to use together with RuleTokenizer (avaliable: %s) (default: 'default')", de.tudarmstadt.lt.seg.token.rules.RuleSet.getAvailable())).create());
+		opts.addOption(OptionBuilder.withLongOpt("boundary-as-part-of-sentence").withDescription("Specify if sentence boundaries should be part of the sentence segment (default: true).").hasArg().withArgName("true|false").create("bps"));
 		opts.addOption(OptionBuilder.withLongOpt("debug").withDescription("Enable debugging.").create());
 	}
 
@@ -121,6 +122,7 @@ public Segmenter(String[] args) {
 			_one_doc_per_line =			cmd.hasOption("l");
 			_ruleset_sentence =			cmd.getOptionValue("sentence-ruleset");
 			_ruleset_token =			cmd.getOptionValue("token-ruleset");
+			_boundary_as_part_of_sentence = Boolean.parseBoolean(cmd.getOptionValue("boundary-as-part-of-sentence","true"));
 
 			DEBUG =						cmd.hasOption("debug");
 			if(DEBUG){
@@ -150,6 +152,7 @@ public Segmenter(String[] args) {
 	boolean _one_doc_per_line;
 	boolean _merge_types;
 	boolean _merge_tokens;
+	boolean _boundary_as_part_of_sentence;
 
 	/* (non-Javadoc)
 	 * @see java.lang.Runnable#run()
@@ -312,6 +315,11 @@ public static void split_and_tokenize(Reader reader, String docid, ISentenceSpli
 		try{
 			final StringBuffer buf = new StringBuffer(); // used for checking of stream is empty; take care when not running sequentially but in parallel!
 			sentenceSplitter.init(reader).stream().sequential().forEach(sentence_segment -> {
+				if(DEBUG){
+					writer.format("%s%s", docid, separator_desc);
+					writer.println(sentence_segment.toString());
+					writer.print(separator_sentence);
+				}
 				if(sentence_segment.type != SegmentType.SENTENCE)
 					return;
 				tokenizer.init(sentence_segment.asString());
@@ -336,7 +344,11 @@ public static void split_and_tokenize(Reader reader, String docid, ISentenceSpli
 				}
 			});
 		}catch(Exception e){
-			System.err.format("%s: ", e.getClass(), e.getMessage());
+			// Report the exception followed by each nested cause, one per line.
+			// The chain must be walked through t: reading from e on every iteration
+			// would print the same entry repeatedly and never terminate once e has a cause.
+			Throwable t = e;
+			while(t != null){
+				System.err.format("%s: %s%n", t.getClass(), t.getMessage());
+				t = t.getCause();
+			}
 		}
 	}
 
@@ -347,7 +359,7 @@ public ITokenizer newTokenizer() throws ClassNotFoundException, InstantiationExc
 		ITokenizer instance = clazz.newInstance();
 		if(RuleTokenizer.class.getSimpleName().equals(_tokenizer_type)){
 			// look up the token ruleset by the name given via --token-ruleset (the sentence ruleset name belongs to the sentence splitter)
 			de.tudarmstadt.lt.seg.token.rules.RuleSet rs = de.tudarmstadt.lt.seg.token.rules.RuleSet.get(_ruleset_token);
-			((RuleTokenizer)instance).init(rs);
+			((RuleTokenizer)instance).initParam(rs);
 		}
 		return instance;
 	}
@@ -359,7 +371,7 @@ public ISentenceSplitter newSentenceSplitter() throws ClassNotFoundException, In
 		ISentenceSplitter instance = clazz.newInstance();
 		if(RuleSplitter.class.getSimpleName().equals(_sentence_splitter_type)){
 			de.tudarmstadt.lt.seg.sentence.rules.RuleSet rs = de.tudarmstadt.lt.seg.sentence.rules.RuleSet.get(_ruleset_sentence);
-			((RuleSplitter)instance).init(rs);
+			((RuleSplitter)instance).initParam(rs, _boundary_as_part_of_sentence);
 		}
 		return instance;
 	}