Skip to content

Commit

Permalink
Merge pull request #676 from mayhewsw/taformat
Browse files Browse the repository at this point in the history
Added the ability to use JSON Serialized Format with NerTagger
  • Loading branch information
Daniel Khashabi authored Aug 2, 2018
2 parents 267fdcb + 6f0768d commit 25d3e70
Show file tree
Hide file tree
Showing 6 changed files with 261 additions and 24 deletions.
5 changes: 3 additions & 2 deletions ner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,9 @@ Where the parameters are:
- this file is used for parameter tuning of the training, use the training file if you don't have a development set (use the same file both for training and for development)
- files-format can be either:
- -c (for column format) or
- -r (for brackets format.
- See below for more information on the formats). Both the training and the development files have to be in the same format.
- -r (for brackets format)
- -json (for JSON-serialized [TextAnnotation](https://github.com/CogComp/cogcomp-nlp/blob/master/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/TextAnnotation.java) format; see [SerializationHelper](https://github.com/CogComp/cogcomp-nlp/blob/master/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/utilities/SerializationHelper.java) for more details)
- See below for more information on the formats. Both the training and the development files have to be in the same format.
Complete, working example. Before running this, open [`config/ner.properties`](config/ner.properties) and change the `pathToModelFile` to
something else (for example, `ner/mymodels`). This will prevent it from attempting to overwrite the jar.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import edu.illinois.cs.cogcomp.lbjava.learn.BatchTrainer;
import edu.illinois.cs.cogcomp.lbjava.learn.SparseAveragedPerceptron;
import edu.illinois.cs.cogcomp.lbjava.learn.SparseNetworkLearner;
import edu.illinois.cs.cogcomp.lbjava.learn.featurepruning.SparseNetworkOptimizer;
import edu.illinois.cs.cogcomp.lbjava.parse.Parser;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.ExpressiveFeaturesAnnotator;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.TwoLayerPredictionAggregationFeatures;
Expand Down Expand Up @@ -59,20 +58,36 @@ public static void buildFinalModel(int fixedNumIterations, String trainDataPath,
getLearningCurve(train, test, fixedNumIterations);
}

/**
 * Convenience overload that trains with the default column ("-c") data format.
 *
 * @param fixedNumIterations number of training rounds; use -1 for the automatic
 *        convergence criterion
 * @param trainDataPath path to the training data
 * @param testDataPath path to the test (development) data
 * @throws Exception if training or evaluation fails
 */
public static void getLearningCurve(int fixedNumIterations, String trainDataPath,
        String testDataPath) throws Exception {
    // BUG FIX: dataFormat is the SECOND parameter of the 4-arg overload
    // (int, dataFormat, trainDataPath, testDataPath). The previous call passed
    // trainDataPath in the dataFormat slot and "-c" as the training path.
    getLearningCurve(fixedNumIterations, "-c", trainDataPath, testDataPath);
}

/**
* train a model with the specified inputs, evaluate with the specified test data
* <p>
* use fixedNumIterations=-1 if you want to use the automatic convergence criterion
* Use fixedNumIterations=-1 if you want to use the automatic convergence criterion
* </p>
* <p>
* In practice, testDataPath should be a Development set.
* </p>
*/
public static void getLearningCurve(int fixedNumIterations, String trainDataPath,
String testDataPath) throws Exception {
public static void getLearningCurve(int fixedNumIterations, String dataFormat, String trainDataPath,
String testDataPath) throws Exception {
logger.debug("getLearningCurve(): fni = " + fixedNumIterations + " trainDataPath = '"
+ trainDataPath + "' testDataPath = '" + testDataPath + "'....");
Data trainData =
new Data(trainDataPath, trainDataPath, "-c", new String[] {}, new String[] {});
new Data(trainDataPath, trainDataPath, dataFormat, new String[] {}, new String[] {});
ExpressiveFeaturesAnnotator.annotate(trainData);
Data testData =
new Data(testDataPath, testDataPath, "-c", new String[] {}, new String[] {});
new Data(testDataPath, testDataPath, dataFormat, new String[] {}, new String[] {});
ExpressiveFeaturesAnnotator.annotate(testData);
Vector<Data> train = new Vector<>();
train.addElement(trainData);
Expand Down Expand Up @@ -154,6 +169,7 @@ public static void getLearningCurve(Vector<Data> trainDataSet, Vector<Data> test
TestDiscrete simpleTest = new TestDiscrete();
simpleTest.addNull("O");
TestDiscrete.testDiscrete(simpleTest, tagger1, null, testParser1, true, 0);

double f1Level1 = simpleTest.getOverallStats()[2];
if (f1Level1 > bestF1Level1) {
bestF1Level1 = f1Level1;
Expand Down Expand Up @@ -249,6 +265,7 @@ public static void getLearningCurve(Vector<Data> trainDataSet, Vector<Data> test
+ "\t Level2: bestround=" + bestRoundLevel2 + "\t F1=" + bestF1Level2);
}

NETesterMultiDataset.printTestResultsByDataset(testDataSet, tagger1, tagger2, true);

/*
* This will override the models forcing to save the iteration we're interested in- the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,25 @@ public class NETesterMultiDataset {
* NB: assuming column format
*/
public static void test(String testDatapath, boolean verbose,
Vector<String> labelsToIgnoreInEvaluation, Vector<String> labelsToAnonymizeInEvaluation)
Vector<String> labelsToIgnoreInEvaluation, Vector<String> labelsToAnonymizeInEvaluation)
throws Exception {
test(testDatapath,verbose, "-c", labelsToIgnoreInEvaluation, labelsToAnonymizeInEvaluation);
}

/**
* Allows format to be specified.
* @param testDatapath
* @param verbose
* @param dataFormat
* @param labelsToIgnoreInEvaluation
* @param labelsToAnonymizeInEvaluation
* @throws Exception
*/
public static void test(String testDatapath, boolean verbose, String dataFormat,
Vector<String> labelsToIgnoreInEvaluation, Vector<String> labelsToAnonymizeInEvaluation)
throws Exception {
Data testData =
new Data(testDatapath, testDatapath, "-c", new String[] {}, new String[] {});
new Data(testDatapath, testDatapath, dataFormat, new String[] {}, new String[] {});
ExpressiveFeaturesAnnotator.annotate(testData);
Vector<Data> data = new Vector<>();
data.addElement(testData);
Expand Down
31 changes: 23 additions & 8 deletions ner/src/main/java/edu/illinois/cs/cogcomp/ner/NerTagger.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,16 @@ public static void main(String[] args) {
ResourceManager rm = new ResourceManager(args[args.length - 1]);
Parameters.readConfigAndLoadExternalData(args[args.length - 1], areWeTraining);

if (args[0].equalsIgnoreCase("-train"))
LearningCurveMultiDataset.getLearningCurve(-1, args[1], args[2]);
else if (args[0].equalsIgnoreCase("-trainFixedIterations"))
if (args[0].equalsIgnoreCase("-train")) {
String dataFormat;
// config file is always the last one.
if(args.length < 5){
dataFormat = "-c";
}else{
dataFormat = args[3];
}
LearningCurveMultiDataset.getLearningCurve(-1, dataFormat, args[1], args[2]);
}else if (args[0].equalsIgnoreCase("-trainFixedIterations"))
LearningCurveMultiDataset.getLearningCurve(Integer.parseInt(args[1]), args[2],
args[3]);
else {
Expand Down Expand Up @@ -68,10 +75,17 @@ else if (args[0].equalsIgnoreCase("-trainFixedIterations"))
logger.info(output.toString());
}
}
if (args[0].equalsIgnoreCase("-test"))
NETesterMultiDataset.test(args[1], false, cp.labelsToIgnoreInEvaluation,
if (args[0].equalsIgnoreCase("-test")) {
String dataFormat;
// config file is always the last one.
if(args.length < 4){
dataFormat = "-c";
}else{
dataFormat = args[2];
}
NETesterMultiDataset.test(args[1], true, dataFormat, cp.labelsToIgnoreInEvaluation,
cp.labelsToAnonymizeInEvaluation);
if (args[0].equalsIgnoreCase("-dumpFeatures"))
}if (args[0].equalsIgnoreCase("-dumpFeatures"))
NETesterMultiDataset.dumpFeaturesLabeledData(args[1], args[2]);
}
} catch (Exception e) {
Expand All @@ -87,9 +101,10 @@ private static void printUsage(PrintStream out) {
"Usage: edu.illinois.cs.cogcomp.ner.NerTagger <command> [options] <config-file>\n";
usage +=
"commands:\n" + "\t-demo\n" + "\t-annotate <input-dir> <output-dir>\n"
+ "\t-train <train-dir> <test-dir>\n"
+ "\t-train <train-dir> <test-dir> <dataformat = {-c, -r, -json}, -c by default>\n"
+ "\t-trainFixedIterations <num-iters> <train-dir> <test-dir>\n"
+ "\t-test <test-dir>\n" + "\t-dumpFeatures <test-dir> <output-dir>";
+ "\t-test <test-dir> <dataformat = {-c, -r, -json}, -c by default>\n"
+ "\t-dumpFeatures <test-dir> <output-dir>";
out.println(usage);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
*/
package edu.illinois.cs.cogcomp.ner.ParsingProcessingData;

import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode;
Expand All @@ -15,6 +17,7 @@
import org.slf4j.LoggerFactory;

import java.io.File;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Vector;
Expand Down Expand Up @@ -57,6 +60,9 @@ public static Vector<NERDocument> readFolder(String path, String format) throws
}
}
}

logger.info("Read " + files.length + " files from " + path);

return res;
}

Expand All @@ -65,13 +71,14 @@ public static NERDocument readFile(String path, String format, String documentNa
NERDocument res = null;
if (format.equals("-c")) {
res = (new ColumnFileReader(path)).read(documentName);
} else if (format.equals("-r")) {
res = BracketFileReader.read(path, documentName);
}else if (format.equals("-json")) {
TextAnnotation ta = SerializationHelper.deserializeTextAnnotationFromFile(path, true);
res = TextAnnotationConverter.getNerDocument(ta);
} else {
if (format.equals("-r")) {
res = BracketFileReader.read(path, documentName);
} else {
System.err.println("Fatal error: unrecognized file format: " + format);
System.exit(0);
}
System.err.println("Fatal error: unrecognized file format: " + format);
System.exit(0);
}
connectSentenceBoundaries(res.sentences);
return res;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
package edu.illinois.cs.cogcomp.ner.ParsingProcessingData;

import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*;
import edu.illinois.cs.cogcomp.lbjava.nlp.Word;
import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
import edu.illinois.cs.cogcomp.ner.LbjTagger.Data;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Converts between the cogcomp-core {@code TextAnnotation} representation and the
 * NER package's {@code Data}/{@code NERDocument} representation, in both directions.
 * Conversion into NER data produces BIO-tagged tokens; conversion back reads the
 * level-2 predictions ({@code neTypeLevel2}) and writes an NER_CONLL span view.
 */
public class TextAnnotationConverter {

    private static Logger logger = LoggerFactory.getLogger(TextAnnotationConverter.class);

    /**
     * NER code uses the Data object to run. This converts TextAnnotations into a Data
     * object. Important: this creates data with BIO labeling.
     *
     * @param tas list of text annotations
     * @return a Data object containing one NERDocument per input TextAnnotation
     * @throws Exception propagated from document conversion
     */
    public static Data loaddataFromTAs(List<TextAnnotation> tas) throws Exception {
        Data data = new Data();
        for (TextAnnotation ta : tas) {
            data.documents.add(getNerDocument(ta));
        }
        return data;
    }

    /**
     * Convert a single TextAnnotation into an NERDocument, for use in a Data object.
     * Gold labels are taken from the NER_CONLL view when present; otherwise an empty
     * view is created and attached, so every token is labeled "O".
     *
     * @param ta a text annotation
     * @return NERDocument holding one LinkedVector of NEWords per non-empty sentence
     */
    public static NERDocument getNerDocument(TextAnnotation ta) {
        // convert this data structure into one the NER package can deal with.
        ArrayList<LinkedVector> sentences = new ArrayList<>();

        View ner;
        if (ta.hasView(ViewNames.NER_CONLL)) {
            ner = ta.getView(ViewNames.NER_CONLL);
        } else {
            // No gold NER view: attach an empty one so all tokens fall through to "O".
            ner = new View(ViewNames.NER_CONLL, "Ltf2TextAnnotation", ta, 1.0);
            ta.addView(ViewNames.NER_CONLL, ner);
        }

        // NOTE: the original version also built a token-index array here
        // (tokenindices/tokenIndex/neWordIndex); it was written but never read,
        // so the dead code has been removed.
        for (int i = 0; i < ta.getNumberOfSentences(); i++) {
            Sentence sentence = ta.getSentence(i);
            int sentstart = sentence.getStartSpan();

            LinkedVector words = new LinkedVector();

            for (int k = 0; k < sentence.size(); k++) {
                int tokenid = sentstart + k;
                String w = sentence.getToken(k);

                List<Constituent> cons = ner.getConstituentsCoveringToken(tokenid);
                if (cons.size() > 1) {
                    logger.error("Doc: " + ta.getId() + ", Too many constituents for token "
                            + tokenid + ", choosing just the first.");
                }

                // BIO encoding: the first token of a constituent gets B-, the rest I-.
                String tag = "O";
                if (cons.size() > 0) {
                    Constituent c = cons.get(0);
                    if (tokenid == c.getSpan().getFirst())
                        tag = "B-" + c.getLabel();
                    else
                        tag = "I-" + c.getLabel();
                }

                if (w.length() > 0) {
                    NEWord word = new NEWord(new Word(w), null, tag);
                    NEWord.addTokenToSentence(words, word);
                } else {
                    logger.error("Bad (zero length) token.");
                }
            }
            if (words.size() > 0)
                sentences.add(words);
        }
        return new NERDocument(sentences, ta.getId());
    }

    /**
     * Assume data is annotated at this point. This will add an NER view to the TAs.
     *
     * NOTE(review): documents are matched to TextAnnotations by id; a document whose
     * id is missing from {@code tas} causes a NullPointerException — confirm that ids
     * always round-trip from {@link #loaddataFromTAs(List)}.
     *
     * @param data annotated NER data whose predictions should be written back
     * @param tas the TextAnnotations to receive the NER_CONLL view
     */
    public static void Data2TextAnnotation(Data data, List<TextAnnotation> tas) {

        HashMap<String, TextAnnotation> id2ta = new HashMap<>();
        for (TextAnnotation ta : tas) {
            id2ta.put(ta.getId(), ta);
        }

        for (NERDocument doc : data.documents) {
            String docid = doc.docname;

            TextAnnotation ta = id2ta.get(docid);
            ArrayList<LinkedVector> nerSentences = doc.sentences;
            SpanLabelView nerView = new SpanLabelView(ViewNames.NER_CONLL, ta);

            // each LinkedVector in data corresponds to a sentence.
            int tokenoffset = 0;
            for (LinkedVector sentence : nerSentences) {
                boolean open = false;

                // there should be a 1:1 mapping btw sentence tokens in record and
                // words/predictions from NER.
                int startIndex = -1;
                String label = null;
                for (int j = 0; j < sentence.size(); j++, tokenoffset++) {
                    NEWord neWord = (NEWord) (sentence.get(j));
                    String prediction = neWord.neTypeLevel2;

                    // LAM-tlr this is not a great way to ascertain the entity type, it's a
                    // bit convoluted, and very inefficient; use enums or nominalized indexes
                    // for this sort of thing.
                    // Open a span on a B- tag, or on an I- tag whose type differs from the
                    // previous token's prediction (recovers from inconsistent tagger output).
                    if (prediction.startsWith("B-")) {
                        startIndex = tokenoffset;
                        label = prediction.substring(2);
                        open = true;
                    } else if (j > 0) {
                        String previous_prediction = ((NEWord) sentence.get(j - 1)).neTypeLevel2;
                        if (prediction.startsWith("I-")
                                && (!previous_prediction.endsWith(prediction.substring(2)))) {
                            startIndex = tokenoffset;
                            label = prediction.substring(2);
                            open = true;
                        }
                    }

                    if (open) {
                        // Close the span at sentence end, or when the next prediction starts
                        // a new entity (B-), is outside (O), or carries a different type.
                        boolean close = false;
                        if (j == sentence.size() - 1) {
                            close = true;
                        } else {
                            String next_prediction = ((NEWord) sentence.get(j + 1)).neTypeLevel2;
                            if (next_prediction.startsWith("B-"))
                                close = true;
                            if (next_prediction.equals("O"))
                                close = true;
                            if (next_prediction.indexOf('-') > -1
                                    && (!prediction.endsWith(next_prediction.substring(2))))
                                close = true;
                        }
                        if (close) {
                            // End index is exclusive, hence tokenoffset + 1.
                            nerView.addSpanLabel(startIndex, tokenoffset + 1, label, 1d);
                            open = false;
                        }
                    }
                }
            }
            ta.addView(ViewNames.NER_CONLL, nerView);
        }
    }

}

0 comments on commit 25d3e70

Please sign in to comment.