Skip to content

Commit

Permalink
Merge pull request #676 from mayhewsw/taformat
Browse files Browse the repository at this point in the history
Added the ability to use JSON Serialized Format with NerTagger
  • Loading branch information
Daniel Khashabi authored Aug 2, 2018
2 parents 267fdcb + 6f0768d commit 25d3e70
Show file tree
Hide file tree
Showing 6 changed files with 261 additions and 24 deletions.
5 changes: 3 additions & 2 deletions ner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,9 @@ Where the parameters are:
- this file is used for parameter tuning of the training, use the training file if you don't have a development set (use the same file both for training and for development)
- files-format can be either:
- -c (for column format) or
- -r (for brackets format.
- See below for more information on the formats). Both the training and the development files have to be in the same format.
- -r (for brackets format)
- -json (for JSON-serialized [TextAnnotation](https://github.com/CogComp/cogcomp-nlp/blob/master/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TextAnnotation.java) format; see [SerializationHelper](https://github.com/CogComp/cogcomp-nlp/blob/master/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/utilities/SerializationHelper.java) for more details)
- See below for more information on the formats. Both the training and the development files have to be in the same format.
Complete, working example. Before running this, open [`config/ner.properties`](config/ner.properties) and change the `pathToModelFile` to
something else (for example, `ner/mymodels`). This will prevent it from attempting to overwrite the jar.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import edu.illinois.cs.cogcomp.lbjava.learn.BatchTrainer;
import edu.illinois.cs.cogcomp.lbjava.learn.SparseAveragedPerceptron;
import edu.illinois.cs.cogcomp.lbjava.learn.SparseNetworkLearner;
import edu.illinois.cs.cogcomp.lbjava.learn.featurepruning.SparseNetworkOptimizer;
import edu.illinois.cs.cogcomp.lbjava.parse.Parser;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.ExpressiveFeaturesAnnotator;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.TwoLayerPredictionAggregationFeatures;
Expand Down Expand Up @@ -59,20 +58,36 @@ public static void buildFinalModel(int fixedNumIterations, String trainDataPath,
getLearningCurve(train, test, fixedNumIterations);
}

/**
 * Convenience overload that trains with the default column ("-c") data format.
 *
 * @param fixedNumIterations number of training rounds; use -1 for the automatic
 *        convergence criterion
 * @param trainDataPath path to the training data
 * @param testDataPath path to the test (development) data
 * @throws Exception if training or evaluation fails
 */
public static void getLearningCurve(int fixedNumIterations, String trainDataPath,
        String testDataPath) throws Exception {
    // BUG FIX: dataFormat is the SECOND parameter of the 4-arg overload
    // (int, dataFormat, trainDataPath, testDataPath). The previous call passed
    // trainDataPath in the dataFormat slot and "-c" as the training path.
    getLearningCurve(fixedNumIterations, "-c", trainDataPath, testDataPath);
}

/**
* train a model with the specified inputs, evaluate with the specified test data
* <p>
* use fixedNumIterations=-1 if you want to use the automatic convergence criterion
* Use fixedNumIterations=-1 if you want to use the automatic convergence criterion
* </p>
* <p>
* In practice, testDataPath should be a Development set.
* </p>
*/
public static void getLearningCurve(int fixedNumIterations, String trainDataPath,
String testDataPath) throws Exception {
public static void getLearningCurve(int fixedNumIterations, String dataFormat, String trainDataPath,
String testDataPath) throws Exception {
logger.debug("getLearningCurve(): fni = " + fixedNumIterations + " trainDataPath = '"
+ trainDataPath + "' testDataPath = '" + testDataPath + "'....");
Data trainData =
new Data(trainDataPath, trainDataPath, "-c", new String[] {}, new String[] {});
new Data(trainDataPath, trainDataPath, dataFormat, new String[] {}, new String[] {});
ExpressiveFeaturesAnnotator.annotate(trainData);
Data testData =
new Data(testDataPath, testDataPath, "-c", new String[] {}, new String[] {});
new Data(testDataPath, testDataPath, dataFormat, new String[] {}, new String[] {});
ExpressiveFeaturesAnnotator.annotate(testData);
Vector<Data> train = new Vector<>();
train.addElement(trainData);
Expand Down Expand Up @@ -154,6 +169,7 @@ public static void getLearningCurve(Vector<Data> trainDataSet, Vector<Data> test
TestDiscrete simpleTest = new TestDiscrete();
simpleTest.addNull("O");
TestDiscrete.testDiscrete(simpleTest, tagger1, null, testParser1, true, 0);

double f1Level1 = simpleTest.getOverallStats()[2];
if (f1Level1 > bestF1Level1) {
bestF1Level1 = f1Level1;
Expand Down Expand Up @@ -249,6 +265,7 @@ public static void getLearningCurve(Vector<Data> trainDataSet, Vector<Data> test
+ "\t Level2: bestround=" + bestRoundLevel2 + "\t F1=" + bestF1Level2);
}

NETesterMultiDataset.printTestResultsByDataset(testDataSet, tagger1, tagger2, true);

/*
* This will override the models forcing to save the iteration we're interested in- the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,25 @@ public class NETesterMultiDataset {
* NB: assuming column format
*/
public static void test(String testDatapath, boolean verbose,
Vector<String> labelsToIgnoreInEvaluation, Vector<String> labelsToAnonymizeInEvaluation)
Vector<String> labelsToIgnoreInEvaluation, Vector<String> labelsToAnonymizeInEvaluation)
throws Exception {
test(testDatapath,verbose, "-c", labelsToIgnoreInEvaluation, labelsToAnonymizeInEvaluation);
}

/**
* Allows format to be specified.
* @param testDatapath
* @param verbose
* @param dataFormat
* @param labelsToIgnoreInEvaluation
* @param labelsToAnonymizeInEvaluation
* @throws Exception
*/
public static void test(String testDatapath, boolean verbose, String dataFormat,
Vector<String> labelsToIgnoreInEvaluation, Vector<String> labelsToAnonymizeInEvaluation)
throws Exception {
Data testData =
new Data(testDatapath, testDatapath, "-c", new String[] {}, new String[] {});
new Data(testDatapath, testDatapath, dataFormat, new String[] {}, new String[] {});
ExpressiveFeaturesAnnotator.annotate(testData);
Vector<Data> data = new Vector<>();
data.addElement(testData);
Expand Down
31 changes: 23 additions & 8 deletions ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerTagger.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,16 @@ public static void main(String[] args) {
ResourceManager rm = new ResourceManager(args[args.length - 1]);
Parameters.readConfigAndLoadExternalData(args[args.length - 1], areWeTraining);

if (args[0].equalsIgnoreCase("-train"))
LearningCurveMultiDataset.getLearningCurve(-1, args[1], args[2]);
else if (args[0].equalsIgnoreCase("-trainFixedIterations"))
if (args[0].equalsIgnoreCase("-train")) {
String dataFormat;
// config file is always the last one.
if(args.length < 5){
dataFormat = "-c";
}else{
dataFormat = args[3];
}
LearningCurveMultiDataset.getLearningCurve(-1, dataFormat, args[1], args[2]);
}else if (args[0].equalsIgnoreCase("-trainFixedIterations"))
LearningCurveMultiDataset.getLearningCurve(Integer.parseInt(args[1]), args[2],
args[3]);
else {
Expand Down Expand Up @@ -68,10 +75,17 @@ else if (args[0].equalsIgnoreCase("-trainFixedIterations"))
logger.info(output.toString());
}
}
if (args[0].equalsIgnoreCase("-test"))
NETesterMultiDataset.test(args[1], false, cp.labelsToIgnoreInEvaluation,
if (args[0].equalsIgnoreCase("-test")) {
String dataFormat;
// config file is always the last one.
if(args.length < 4){
dataFormat = "-c";
}else{
dataFormat = args[2];
}
NETesterMultiDataset.test(args[1], true, dataFormat, cp.labelsToIgnoreInEvaluation,
cp.labelsToAnonymizeInEvaluation);
if (args[0].equalsIgnoreCase("-dumpFeatures"))
}if (args[0].equalsIgnoreCase("-dumpFeatures"))
NETesterMultiDataset.dumpFeaturesLabeledData(args[1], args[2]);
}
} catch (Exception e) {
Expand All @@ -87,9 +101,10 @@ private static void printUsage(PrintStream out) {
"Usage: edu.illinois.cs.cogcomp.ner.NerTagger <command> [options] <config-file>\n";
usage +=
"commands:\n" + "\t-demo\n" + "\t-annotate <input-dir> <output-dir>\n"
+ "\t-train <train-dir> <test-dir>\n"
+ "\t-train <train-dir> <test-dir> <dataformat = {-c, -r, -json}, -c by default>\n"
+ "\t-trainFixedIterations <num-iters> <train-dir> <test-dir>\n"
+ "\t-test <test-dir>\n" + "\t-dumpFeatures <test-dir> <output-dir>";
+ "\t-test <test-dir> <dataformat = {-c, -r, -json}, -c by default>\n"
+ "\t-dumpFeatures <test-dir> <output-dir>";
out.println(usage);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
*/
package edu.illinois.cs.cogcomp.ner.ParsingProcessingData;

import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode;
Expand All @@ -15,6 +17,7 @@
import org.slf4j.LoggerFactory;

import java.io.File;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Vector;
Expand Down Expand Up @@ -57,6 +60,9 @@ public static Vector<NERDocument> readFolder(String path, String format) throws
}
}
}

logger.info("Read " + files.length + " files from " + path);

return res;
}

Expand All @@ -65,13 +71,14 @@ public static NERDocument readFile(String path, String format, String documentNa
NERDocument res = null;
if (format.equals("-c")) {
res = (new ColumnFileReader(path)).read(documentName);
} else if (format.equals("-r")) {
res = BracketFileReader.read(path, documentName);
}else if (format.equals("-json")) {
TextAnnotation ta = SerializationHelper.deserializeTextAnnotationFromFile(path, true);
res = TextAnnotationConverter.getNerDocument(ta);
} else {
if (format.equals("-r")) {
res = BracketFileReader.read(path, documentName);
} else {
System.err.println("Fatal error: unrecognized file format: " + format);
System.exit(0);
}
System.err.println("Fatal error: unrecognized file format: " + format);
System.exit(0);
}
connectSentenceBoundaries(res.sentences);
return res;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
package edu.illinois.cs.cogcomp.ner.ParsingProcessingData;

import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*;
import edu.illinois.cs.cogcomp.lbjava.nlp.Word;
import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
import edu.illinois.cs.cogcomp.ner.LbjTagger.Data;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Converts between the cogcomp-core {@code TextAnnotation} representation and the
 * NER package's {@code Data}/{@code NERDocument} representation, in both directions.
 * Conversion into NER data produces BIO-tagged tokens; conversion back reads the
 * level-2 predictions ({@code neTypeLevel2}) and writes an NER_CONLL span view.
 */
public class TextAnnotationConverter {

    private static Logger logger = LoggerFactory.getLogger(TextAnnotationConverter.class);

    /**
     * NER code uses the Data object to run. This converts TextAnnotations into a Data
     * object. Important: this creates data with BIO labeling.
     *
     * @param tas list of text annotations
     * @return a Data object containing one NERDocument per input TextAnnotation
     * @throws Exception propagated from document conversion
     */
    public static Data loaddataFromTAs(List<TextAnnotation> tas) throws Exception {
        Data data = new Data();
        for (TextAnnotation ta : tas) {
            data.documents.add(getNerDocument(ta));
        }
        return data;
    }

    /**
     * Convert a single TextAnnotation into an NERDocument, for use in a Data object.
     * Gold labels are taken from the NER_CONLL view when present; otherwise an empty
     * view is created and attached, so every token is labeled "O".
     *
     * @param ta a text annotation
     * @return NERDocument holding one LinkedVector of NEWords per non-empty sentence
     */
    public static NERDocument getNerDocument(TextAnnotation ta) {
        // convert this data structure into one the NER package can deal with.
        ArrayList<LinkedVector> sentences = new ArrayList<>();

        View ner;
        if (ta.hasView(ViewNames.NER_CONLL)) {
            ner = ta.getView(ViewNames.NER_CONLL);
        } else {
            // No gold NER view: attach an empty one so all tokens fall through to "O".
            ner = new View(ViewNames.NER_CONLL, "Ltf2TextAnnotation", ta, 1.0);
            ta.addView(ViewNames.NER_CONLL, ner);
        }

        // NOTE: the original version also built a token-index array here
        // (tokenindices/tokenIndex/neWordIndex); it was written but never read,
        // so the dead code has been removed.
        for (int i = 0; i < ta.getNumberOfSentences(); i++) {
            Sentence sentence = ta.getSentence(i);
            int sentstart = sentence.getStartSpan();

            LinkedVector words = new LinkedVector();

            for (int k = 0; k < sentence.size(); k++) {
                int tokenid = sentstart + k;
                String w = sentence.getToken(k);

                List<Constituent> cons = ner.getConstituentsCoveringToken(tokenid);
                if (cons.size() > 1) {
                    logger.error("Doc: " + ta.getId() + ", Too many constituents for token "
                            + tokenid + ", choosing just the first.");
                }

                // BIO encoding: the first token of a constituent gets B-, the rest I-.
                String tag = "O";
                if (cons.size() > 0) {
                    Constituent c = cons.get(0);
                    if (tokenid == c.getSpan().getFirst())
                        tag = "B-" + c.getLabel();
                    else
                        tag = "I-" + c.getLabel();
                }

                if (w.length() > 0) {
                    NEWord word = new NEWord(new Word(w), null, tag);
                    NEWord.addTokenToSentence(words, word);
                } else {
                    logger.error("Bad (zero length) token.");
                }
            }
            if (words.size() > 0)
                sentences.add(words);
        }
        return new NERDocument(sentences, ta.getId());
    }

    /**
     * Assume data is annotated at this point. This will add an NER view to the TAs.
     *
     * NOTE(review): documents are matched to TextAnnotations by id; a document whose
     * id is missing from {@code tas} causes a NullPointerException — confirm that ids
     * always round-trip from {@link #loaddataFromTAs(List)}.
     *
     * @param data annotated NER data whose predictions should be written back
     * @param tas the TextAnnotations to receive the NER_CONLL view
     */
    public static void Data2TextAnnotation(Data data, List<TextAnnotation> tas) {

        HashMap<String, TextAnnotation> id2ta = new HashMap<>();
        for (TextAnnotation ta : tas) {
            id2ta.put(ta.getId(), ta);
        }

        for (NERDocument doc : data.documents) {
            String docid = doc.docname;

            TextAnnotation ta = id2ta.get(docid);
            ArrayList<LinkedVector> nerSentences = doc.sentences;
            SpanLabelView nerView = new SpanLabelView(ViewNames.NER_CONLL, ta);

            // each LinkedVector in data corresponds to a sentence.
            int tokenoffset = 0;
            for (LinkedVector sentence : nerSentences) {
                boolean open = false;

                // there should be a 1:1 mapping btw sentence tokens in record and
                // words/predictions from NER.
                int startIndex = -1;
                String label = null;
                for (int j = 0; j < sentence.size(); j++, tokenoffset++) {
                    NEWord neWord = (NEWord) (sentence.get(j));
                    String prediction = neWord.neTypeLevel2;

                    // LAM-tlr this is not a great way to ascertain the entity type, it's a
                    // bit convoluted, and very inefficient; use enums or nominalized indexes
                    // for this sort of thing.
                    // Open a span on a B- tag, or on an I- tag whose type differs from the
                    // previous token's prediction (recovers from inconsistent tagger output).
                    if (prediction.startsWith("B-")) {
                        startIndex = tokenoffset;
                        label = prediction.substring(2);
                        open = true;
                    } else if (j > 0) {
                        String previous_prediction = ((NEWord) sentence.get(j - 1)).neTypeLevel2;
                        if (prediction.startsWith("I-")
                                && (!previous_prediction.endsWith(prediction.substring(2)))) {
                            startIndex = tokenoffset;
                            label = prediction.substring(2);
                            open = true;
                        }
                    }

                    if (open) {
                        // Close the span at sentence end, or when the next prediction starts
                        // a new entity (B-), is outside (O), or carries a different type.
                        boolean close = false;
                        if (j == sentence.size() - 1) {
                            close = true;
                        } else {
                            String next_prediction = ((NEWord) sentence.get(j + 1)).neTypeLevel2;
                            if (next_prediction.startsWith("B-"))
                                close = true;
                            if (next_prediction.equals("O"))
                                close = true;
                            if (next_prediction.indexOf('-') > -1
                                    && (!prediction.endsWith(next_prediction.substring(2))))
                                close = true;
                        }
                        if (close) {
                            // End index is exclusive, hence tokenoffset + 1.
                            nerView.addSpanLabel(startIndex, tokenoffset + 1, label, 1d);
                            open = false;
                        }
                    }
                }
            }
            ta.addView(ViewNames.NER_CONLL, nerView);
        }
    }

}

0 comments on commit 25d3e70

Please sign in to comment.