Skip to content

Commit

Permalink
updated to new segmenter version
Browse files Browse the repository at this point in the history
  • Loading branch information
remstef committed Mar 14, 2017
1 parent f8ca9a3 commit 6b25bd1
Show file tree
Hide file tree
Showing 36 changed files with 641 additions and 192 deletions.
3 changes: 1 addition & 2 deletions install.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

VERSION=0.7.0
VERSION=0.7.1
DIR="topicrawler-${VERSION}"

mkdir -p ${DIR}
Expand All @@ -27,4 +27,3 @@ tar -xvzf lt.ltbot-*.tar.gz --strip-components 1 -C heritrix-3.2.0




2 changes: 1 addition & 1 deletion lt.lm/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.kd-suite</artifactId>
<version>0.7.0</version>
<version>0.7.1</version>
</parent>

<artifactId>lt.lm</artifactId>
Expand Down
7 changes: 5 additions & 2 deletions lt.lm/src/main/sh/lm
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,14 @@ lib_dir=${lmhome}/lib; while IFS= read -r -d '' f; do cp=${cp}:"${f}"; done < <(
cp=${cp:1} # remove heading colon

# skip all -D.. and -X.. parameters before the actual main class and add them later to JAVA_OPTS
DX=''
DX=""
while [[ $1 == -D* || $1 == -X* ]]; do
DX="$DX $1"
DX="$DX "$(printf '%q' "${1}") # quote args
shift
done
JAVA_OPTS="$JAVA_OPTS $DX"
args=""
for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args

mainclass=${1:-MainFinder} # set main class if no further arg was provided
if [ ! -z "$1" ]; then # if at least one arg was passed
Expand Down
7 changes: 6 additions & 1 deletion lt.lm/src/main/sh/lm-nightly
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,9 @@ lmsrc="${kdhome}/lt.lm"

tgt=$(find "${lmsrc}/target" -type f -name "lm" | grep "dist/" | head -n1)

eval "${tgt} $@"
# Re-quote every argument so blanks and shell metacharacters survive the eval below.
args=""
for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args

# Quote the launcher path too: an unquoted path containing blanks would be split by eval,
# and an empty path (no dist build found) would make eval run the first argument as the command.
cmd="$(printf '%q' "${tgt}")${args}"

eval "${cmd}"
2 changes: 1 addition & 1 deletion lt.ltbot/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.kd-suite</artifactId>
<version>0.7.0</version>
<version>0.7.1</version>
</parent>

<artifactId>lt.ltbot</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ public String getTargetLanguageCode(){
}
public void setTargetLanguageCode(String target_language_code){
_target_language_code = target_language_code;
_rule_splitter.get().initParam(_target_language_code, false);
}

public Stream<String> getSentencesStream(String text){
Expand All @@ -67,7 +68,7 @@ public Stream<String> getSentencesStream(String text){

public Stream<String> getSentencesStream(String text, String languagecode){
return _line_splitter.get().init(new StringReader(text)).stream().filter(s -> s.type == SegmentType.SENTENCE).map(Segment::asString).flatMap(line -> {
return _rule_splitter.get().init(new StringReader(line), languagecode).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
return _rule_splitter.get().init(new StringReader(line)).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
final AtomicInteger c = new AtomicInteger();
String r = _tokenizer.get().init(s.asString()).stream().sequential().map(t -> {
if(t.isWord())
Expand Down
23 changes: 23 additions & 0 deletions lt.ltbot/src/test/scripts/prepare_eval.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,27 @@ for t in ai-en vehicles-en plants-en; do for m in nf 1 2 3 5; do d=$b/$t-$m; cat
find . -name docperps.tsv | while read f; do d=$(dirname $f); cat $f | perl -F\\t -lanE 'if($F[1] =~ /.*[0-9].*/ && $F[1]<1e4){print $_}' > $d/docperps-pr1e4.tsv & done


dirs="ai-en-5 plants-en-5 vehicles-en-5"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done

### steffen@farnsworth:/mnt/farnsworthshare/semeval-2015-task17-texeval/wiki$ zcat wikipedia.txt.gz | wc
### 109136195 1868200724 11792896391
### wikipedia 11.8GB non-unique

dirs="ai-en-nf vehicles-en-nf plants-en-nf"
for d in $dirs; do jobs=$(ls -tr $d | grep job); for j in $jobs; do files=$(ls -tr $d/$j/sentences/ | grep "HTML.*\.txt\.gz"); for f in $files; do zcat $d/$j/sentences/$f | cut -f2,5 | gzip -c >> $d/crawl-sentences.txt.gz ; done ; done & done

# check size
fun () { zcat $1 | wc > $1.wc; }; for d in $dirs; do fun $d/crawl-sentences.txt.gz & done

dirs="ai-en-nf plants-en-nf vehicles-en-nf"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done

# combine function with find command
fun () { echo $1; ls -lah $1; }; find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d; done

# sync to farnsworth
fun () { rsync -avvzhP $1/crawl-sentences* fw:data/semeval-2015-task17-texeval/$1/ ; }
find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d & done




12 changes: 7 additions & 5 deletions lt.seg/README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,18 @@ lt.seg comes with a number of parameters, run `seg -?` to get a list of options
* `--normalize <level>` (`-nl`)
* `0` (default): no normalization, each segment will be printed as it is in the input
* `1`: reduce same consecutive non-word characters, e.g. multiple consecutive blanks will be merged to one. Example: "\t\t\n\t\t" -> "\t\n\t"
* `2`: `1` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
* `3`: `2` + replace all non-word segments with its symbol.
* `4`: `3` + lowercase words.
* `2`: `1` + replace empty space and punctuation characters with its symbol
* `3`: `2` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
* `4`: `3` + replace all non-word segments with its symbol.
* `5`: `4` + lowercase words.
* `--filter <level>` (`-fl`): *Note: examples below use normalization level (-nl)* `3` and DiffTokenizer
* `0`: no filtering, each segment will be printed separated by blanks (this also includes emptyspace segments, in most cases you probably want to use at least `1` or `2`)
* `1`: filter control character segments
* `2`: (default): `1` + filter emptyspace segments
* `3`: `2` + filter unclassified and non-readable segments (attention: results heavily depend on tokenizer)
* `4`: `3` + filter punctuation characters. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0"
* `5`: `4` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
* `4`: `3` + filter punctuation characters
* `5`: `4` + filter meta data like URLs, file descriptors, emails, wiki markup, emoticons, etc.
* `6`: `5` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
* `--merge [<level>]` (`-ml`): *Note: examples below use normalization level (-nl)* `3`
* `0`: no merging (default when not specified)
* `1`: merge same consecutive token types if they are not words or words with numbers (default when just -ml specified). Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0 . "
Expand Down
2 changes: 1 addition & 1 deletion lt.seg/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<groupId>de.tudarmstadt</groupId>
<artifactId>lt.kd-suite</artifactId>
<version>0.7.0</version>
<version>0.7.1</version>
</parent>

<artifactId>lt.seg</artifactId>
Expand Down
53 changes: 40 additions & 13 deletions lt.seg/src/main/java/de/tudarmstadt/lt/seg/Segment.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package de.tudarmstadt.lt.seg;

import java.util.EnumSet;

/**
*
Expand All @@ -40,6 +41,11 @@ public class Segment {
public boolean hasZeroLength(){
return begin == end;
}

/**
 * Returns the number of characters spanned by this segment, i.e. {@code end - begin}.
 * With assertions enabled, also checks that the stored text has exactly that length.
 *
 * @return the span width {@code end - begin}
 */
public int length(){
    final int span = end - begin;
    assert(text.length() == span);
    return span;
}

public String asString(){
return text.toString();
Expand All @@ -49,7 +55,7 @@ public String asNormalizedString(int level){

String result = text.toString();

if(level >= 1 && type == SegmentType.NON_WORD){ // reduce non-word characters
if(level >= 1 && (type == SegmentType.NON_WORD || type == SegmentType.UNKNOWN)){ // reduce non-word characters
StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
if(x.length() == 0 || x.codePointBefore(x.length()) != y)
return x.appendCodePoint(y);
Expand All @@ -59,8 +65,17 @@ public String asNormalizedString(int level){
});
result = b.toString();
}

if(level >= 1 && type == SegmentType.CONTROL){ // replace control segments with the symbol of their type
result = type.symbol();
}

if(level >= 2 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCT)){
// replace empty-space and punctuation segments with the symbol of their type, no matter how long the segment was
result = type.symbol();
}

if(level >= 2){
if(level >= 3){
if(type == SegmentType.WORD_WITH_NUMBER){ // replace consecutive digits within a word
StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
if(x.length() == 0){
Expand All @@ -81,16 +96,16 @@ public String asNormalizedString(int level){
});
result = b.toString();
}
if(type == SegmentType.NUMBER)
if(EnumSet.of(SegmentType.NUMBER, SegmentType.DATE, SegmentType.PHONE, SegmentType.TIME).contains(type))
result = type.symbol();
}

if(level >= 3 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCTUATION)){
// replace numbers, punctuation and empty spaces with a single symbol, no matter how long the number once was
result = type.symbol();
if(level >= 4){
if(EnumSet.complementOf(EnumSet.of(SegmentType.WORD, SegmentType.WORD_LOWERCASE, SegmentType.WORD_UPPERCASE, SegmentType.WORD_WITH_NUMBER, SegmentType.SENTENCE, SegmentType.ABBRV, SegmentType.PARAGRAPH, SegmentType.TEXT)).contains(type))
result = type.symbol();
}

if(level >= 4)
if(level >= 5)
result = result.toLowerCase();

return result;
Expand All @@ -104,19 +119,31 @@ public boolean isEmpty(){
return type == SegmentType.EMPTY_SPACE;
}

/**
 * Tells whether this segment is typed as a sentence or as a sentence boundary.
 *
 * @return {@code true} iff the type is {@code SENTENCE} or {@code SENTENCE_BOUNDARY}
 */
public boolean isPartOfSentence(){
    if(type == SegmentType.SENTENCE)
        return true;
    return type == SegmentType.SENTENCE_BOUNDARY;
}

/**
 * Tells whether this segment is one of the plain word types.
 *
 * @return {@code true} iff the type is {@code WORD}, {@code WORD_UPPERCASE} or {@code WORD_LOWERCASE}
 */
public boolean isWord(){
    if(type == SegmentType.WORD)
        return true;
    if(type == SegmentType.WORD_UPPERCASE)
        return true;
    return type == SegmentType.WORD_LOWERCASE;
}

// TODO: replace with type.ordninal in range [x,y]. needs reordering of Segmenttypes
public boolean isReadable(){
return type == SegmentType.WORD ||
type == SegmentType.NUMBER ||
type == SegmentType.WORD_WITH_NUMBER||
type == SegmentType.WORD_UPPERCASE ||
type == SegmentType.WORD_LOWERCASE ||
type == SegmentType.PUNCTUATION;
// SegmentType.WORD,
// SegmentType.NUMBER,
// SegmentType.WORD_WITH_NUMBER,
// SegmentType.WORD_UPPERCASE,
// SegmentType.WORD_LOWERCASE,
// SegmentType.PUNCT
return EnumSet.complementOf(EnumSet.of(
SegmentType.CONTROL,
SegmentType.UNKNOWN,
SegmentType.EMPTY_SPACE
)).contains(type);


}

/* (non-Javadoc)
Expand Down
38 changes: 30 additions & 8 deletions lt.seg/src/main/java/de/tudarmstadt/lt/seg/SegmentType.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,16 @@
*/
public enum SegmentType {

TEXT("\u00a7", SegmentationUtils.chartypes),

CONTROL("\ufffd",

CONTROL("¶",
Character.CONTROL,
Character.FORMAT,
Character.SURROGATE),

PARAGRAPH("\u00b6",
TEXT("T", SegmentationUtils.chartypes),

PARAGRAPH("§",
Arrays.stream(SegmentationUtils.chartypes).filter( x ->
x != Character.PARAGRAPH_SEPARATOR).toArray()),

Expand All @@ -46,7 +48,6 @@ public enum SegmentType {
x != Character.START_PUNCTUATION &&
x != Character.END_PUNCTUATION).toArray()),

// \u2022
SENTENCE_BOUNDARY(". ",
Character.PARAGRAPH_SEPARATOR,
Character.DASH_PUNCTUATION,
Expand Down Expand Up @@ -105,8 +106,7 @@ public enum SegmentType {
Character.OTHER_LETTER
),

// \u2235 \u00b7
PUNCTUATION(".",
PUNCT(".",
Character.NON_SPACING_MARK,
Character.ENCLOSING_MARK,
Character.COMBINING_SPACING_MARK,
Expand All @@ -125,15 +125,37 @@ public enum SegmentType {
Character.CONTROL
),

NON_WORD("\u20a9",
NON_WORD("",
Arrays.stream(SegmentationUtils.chartypes).filter(x ->
x != Character.UPPERCASE_LETTER &&
x != Character.LOWERCASE_LETTER &&
x != Character.TITLECASE_LETTER &&
x != Character.MODIFIER_LETTER &&
x != Character.OTHER_LETTER).toArray()),

UNKNOWN("\u2e2e");
UNKNOWN("�"),

EMAIL("📧"),

DATE("📅"),

TIME("⌚"),

PHONE("☎"),

META("📓"),

EMO("☺"),

URI("💻"),

ABBRV("㍱"),

REF("˃"),

UNSPECIFIED("⸮"),

;

public static EnumSet<SegmentType> TOKEN_TYPES = EnumSet.range(SegmentType.WORD, SegmentType.NON_WORD);

Expand Down
18 changes: 15 additions & 3 deletions lt.seg/src/main/java/de/tudarmstadt/lt/seg/app/Segmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ public Segmenter() {/* NOTHING TO DO */}
opts.addOption(OptionBuilder.withLongOpt("onedocperline").withDescription("Specify if you want to process documents linewise and preserve document ids, i.e. map line numbers to sentences.").create("l"));
opts.addOption(OptionBuilder.withLongOpt("sentence-ruleset").withArgName("languagecode").hasArg().withDescription(String.format("Specify the ruleset that you want to use together with RuleSplitter (avaliable: %s) (default: 'default')", de.tudarmstadt.lt.seg.sentence.rules.RuleSet.getAvailable())).create());
opts.addOption(OptionBuilder.withLongOpt("token-ruleset").withArgName("languagecode").hasArg().withDescription(String.format("Specify the ruleset that you want to use together with RuleTokenizer (avaliable: %s) (default: 'default')", de.tudarmstadt.lt.seg.token.rules.RuleSet.getAvailable())).create());
opts.addOption(OptionBuilder.withLongOpt("boundary-as-part-of-sentence").withDescription("Specify if sentence boundaries should be part of the sentence segment (default: true).").hasArg().withArgName("true|false").create("bps"));
opts.addOption(OptionBuilder.withLongOpt("debug").withDescription("Enable debugging.").create());
}

Expand Down Expand Up @@ -121,6 +122,7 @@ public Segmenter(String[] args) {
_one_doc_per_line = cmd.hasOption("l");
_ruleset_sentence = cmd.getOptionValue("sentence-ruleset");
_ruleset_token = cmd.getOptionValue("token-ruleset");
_boundary_as_part_of_sentence = Boolean.parseBoolean(cmd.getOptionValue("boundary-as-part-of-sentence","true"));

DEBUG = cmd.hasOption("debug");
if(DEBUG){
Expand Down Expand Up @@ -150,6 +152,7 @@ public Segmenter(String[] args) {
boolean _one_doc_per_line;
boolean _merge_types;
boolean _merge_tokens;
boolean _boundary_as_part_of_sentence;

/* (non-Javadoc)
* @see java.lang.Runnable#run()
Expand Down Expand Up @@ -312,6 +315,11 @@ public static void split_and_tokenize(Reader reader, String docid, ISentenceSpli
try{
final StringBuffer buf = new StringBuffer(); // used for checking of stream is empty; take care when not running sequentially but in parallel!
sentenceSplitter.init(reader).stream().sequential().forEach(sentence_segment -> {
if(DEBUG){
writer.format("%s%s", docid, separator_desc);
writer.println(sentence_segment.toString());
writer.print(separator_sentence);
}
if(sentence_segment.type != SegmentType.SENTENCE)
return;
tokenizer.init(sentence_segment.asString());
Expand All @@ -336,7 +344,11 @@ public static void split_and_tokenize(Reader reader, String docid, ISentenceSpli
}
});
}catch(Exception e){
System.err.format("%s: ", e.getClass(), e.getMessage());
// Report the exception and every nested cause, one line each.
// The loop must read from and advance 't': using 'e' here would print the
// outermost exception repeatedly and never terminate when a cause exists.
Throwable t = e;
while(t != null){
System.err.format("%s: %s%n", t.getClass(), t.getMessage());
t = t.getCause();
}
}
}

Expand All @@ -347,7 +359,7 @@ public ITokenizer newTokenizer() throws ClassNotFoundException, InstantiationExc
ITokenizer instance = clazz.newInstance();
if(RuleTokenizer.class.getSimpleName().equals(_tokenizer_type)){
// Use the value of --token-ruleset here; _ruleset_sentence holds the --sentence-ruleset value used for the sentence splitter.
de.tudarmstadt.lt.seg.token.rules.RuleSet rs = de.tudarmstadt.lt.seg.token.rules.RuleSet.get(_ruleset_token);
((RuleTokenizer)instance).init(rs);
((RuleTokenizer)instance).initParam(rs);
}
return instance;
}
Expand All @@ -359,7 +371,7 @@ public ISentenceSplitter newSentenceSplitter() throws ClassNotFoundException, In
ISentenceSplitter instance = clazz.newInstance();
if(RuleSplitter.class.getSimpleName().equals(_sentence_splitter_type)){
de.tudarmstadt.lt.seg.sentence.rules.RuleSet rs = de.tudarmstadt.lt.seg.sentence.rules.RuleSet.get(_ruleset_sentence);
((RuleSplitter)instance).init(rs);
((RuleSplitter)instance).initParam(rs, _boundary_as_part_of_sentence);
}
return instance;
}
Expand Down
Loading

0 comments on commit 6b25bd1

Please sign in to comment.