dodona-edu · rien · May 31, 2024 · May 16, 2024 · May 16, 2024 · May 16, 2024
diff --git a/cli/src/cli/commands/run.ts b/cli/src/cli/commands/run.ts
@@ -57,6 +57,13 @@ export function runCommand(program: Command): Command {
       ),
       x => parseFloat(x),
     )
+    .option(
+      "-i, --ignore <path>",
+      Utils.indent(
+        "Path of a file with template/boilerplate code. " +
+        "Code fragments matching with this file will be ignored."
+      )
+    )
     .option(
       "-L, --limit-results <integer>",
       Utils.indent(
@@ -171,6 +178,7 @@ interface RunOptions extends Options {
   host: string;
   outputFormat: string;
   outputDestination: string;
+  ignore: string;
 }
 
 export async function run(locations: string[], options: RunOptions): Promise<void> {
@@ -199,7 +207,7 @@ export async function run(locations: string[], options: RunOptions): Promise<voi
       sortBy: options.sortBy,
       fragmentSortBy: options.fragmentSortBy,
     });
-    const report = await dolos.analyzePaths(locations);
+    const report = await dolos.analyzePaths(locations, options.ignore);
 
     if (report.warnings.length > 0) {
       report.warnings.forEach(warn => warning(warn));

diff --git a/cli/src/cli/views/fileView.ts b/cli/src/cli/views/fileView.ts
@@ -77,6 +77,7 @@ export class FileView extends View {
       {
         "id": s => s.id,
         "hash": s => s.hash,
+        "ignored": s => s.ignored ? "true" : "false",
         "data": s => s.kgram?.join(" ") || null,
         "files": s => JSON.stringify(s.files().map(f => f.id))
       });
@@ -85,9 +86,10 @@ export class FileView extends View {
   public writeFiles(out: Writable): void {
     writeCSVto<FileEntry>(
       out,
-      this.report.entries(),
+      this.report.entries().concat(this.report.ignoredEntries()),
       {
         "id": f => f.file.id,
+        "ignored": f => f.isIgnored ? "true" : "false",
         "path": f => f.file.path,
         "content": f => f.file.content,
         "amountOfKgrams": f => f.kgrams.length,

diff --git a/core/src/algorithm/fingerprintIndex.ts b/core/src/algorithm/fingerprintIndex.ts
@@ -12,8 +12,10 @@ export type Hash = number;
 
 export interface FileEntry {
+  // The tokenized file this entry describes.
   file: TokenizedFile;
-  kgrams: Array<Range>,
+  // Token range (start, stop) of every fingerprinted kgram of this file.
+  kgrams: Array<Range>;
+  // Fingerprints of this file that are not ignored.
   shared: Set<SharedFingerprint>;
+  // Fingerprints of this file that have been ignored.
+  ignored: Set<SharedFingerprint>;
+  // True when the entry was created for an ignored file (see addIgnoredFile).
+  isIgnored: boolean;
 }
 
 export interface Occurrence {
@@ -22,9 +24,16 @@ export interface Occurrence {
 }
 
 export class FingerprintIndex {
+  // HashFilter transforms tokens into (a selection of) hashes
   private readonly hashFilter: HashFilter;
+  // A map of file id to FileEntry object that has been analysed
   private readonly files: Map<number, FileEntry>;
+  // A map of file id to FileEntry object that is ignored (e.g. template code)
+  private readonly ignoredFiles: Map<number, FileEntry>;
+  // A map of hashes to their Shared Fingerprints (which keep track of the files they are in)
   private readonly index: Map<Hash, SharedFingerprint>;
+  // A set of ignored hashes (either manually added, or through the ignored files, NOT because of maxFileCount)
+  private readonly ignoredHashes: Set<number>;
 
   /**
    * Creates a Fingerprint Index which is able to compare files with each other
@@ -34,11 +43,48 @@ export class FingerprintIndex {
   constructor(
     private readonly kgramLength: number,
     private readonly kgramsInWindow: number,
-    kgramData?: boolean
+    kgramData?: boolean,
+    private maxFingerprintFileCount = Number.MAX_SAFE_INTEGER,
   ) {
     this.hashFilter = new WinnowFilter(this.kgramLength, this.kgramsInWindow, kgramData);
     this.files = new Map<number, FileEntry>();
+    this.ignoredFiles = new Map<number, FileEntry>();
     this.index = new Map<Hash, SharedFingerprint>();
+    this.ignoredHashes = new Set<number>();
+  }
+
+  /**
+   * Register a file whose fingerprints must be ignored (e.g. template or
+   * boilerplate code).
+   *
+   * The entry is stored in `ignoredFiles` (not in `files`) and then processed
+   * by `addEntry` with `isIgnored` set to true.
+   *
+   * @param file The tokenized file to ignore. An assertion fails when a file
+   * with the same id has already been ignored.
+   */
+  public addIgnoredFile(file: TokenizedFile): void {
+    assert(!this.ignoredFiles.has(file.id), `This file has already been ignored: ${file.file.path}`);
+    const entry: FileEntry = {
+      file,
+      kgrams: [],
+      isIgnored: true,
+      shared: new Set<SharedFingerprint>(),
+      ignored: new Set<SharedFingerprint>()
+    };
+
+    this.ignoredFiles.set(file.id, entry);
+    this.addEntry(entry);
+  }
+
+  /**
+   * @returns The maximum number of files a fingerprint may occur in before it
+   * is ignored. This is `Number.MAX_SAFE_INTEGER` when the constructor was not
+   * given another value.
+   */
+  public getMaxFingerprintFileCount(): number {
+    return this.maxFingerprintFileCount;
+  }
+
+  /**
+   * Change the maximum number of files a fingerprint may occur in and
+   * re-evaluate every fingerprint in the index against the new limit.
+   *
+   * Fingerprints that are ignored for another reason (their hash was ignored
+   * explicitly, or they occur in an ignored file) are left untouched.
+   *
+   * @param maxFingerprintFileCount The new limit. `undefined` (or 0) removes
+   * the limit by falling back to `Number.MAX_SAFE_INTEGER`.
+   */
+  public updateMaxFingerprintFileCount(maxFingerprintFileCount: number | undefined): void {
+    // Normalise first, so that "no limit" compares equal to the stored default.
+    const newMax = maxFingerprintFileCount || Number.MAX_SAFE_INTEGER;
+    if (newMax === this.maxFingerprintFileCount) {
+      return;
+    }
+    this.maxFingerprintFileCount = newMax;
+    for (const shared of this.index.values()) {
+      if (this.ignoredHashes.has(shared.hash)) {
+        continue;
+      }
+      // Fingerprints of an ignored file stay ignored regardless of the limit.
+      // Un-ignoring them would also look up an entry that is not in `files`.
+      if (shared.files().some(file => this.ignoredFiles.has(file.id))) {
+        continue;
+      }
+      const overLimit = shared.fileCount() > this.maxFingerprintFileCount;
+      if (overLimit && !shared.ignored) {
+        this.ignoreSharedFingerprint(shared);
+      } else if (!overLimit && shared.ignored) {
+        this.unIgnoreSharedFingerprint(shared);
+      }
+    }
   }
 
   public addFiles(tokenizedFiles: TokenizedFile[]): Map<Hash, SharedFingerprint> {
@@ -48,69 +94,108 @@ export class FingerprintIndex {
     }
 
     for (const file of tokenizedFiles) {
-      let kgram = 0;
-
       const entry: FileEntry = {
         file,
         kgrams: [],
-        shared: new Set<SharedFingerprint>()
+        isIgnored: false,
+        shared: new Set<SharedFingerprint>(),
+        ignored: new Set<SharedFingerprint>()
       };
 
       this.files.set(file.id, entry);
+      this.addEntry(entry);
+    }
 
-      for (
-        const { data, hash, start, stop  }
-        of this.hashFilter.fingerprints(file.tokens)
-      ) {
-
-        // add kgram to file
-        entry.kgrams.push(new Range(start, stop));
-
-        // sanity check
-        assert(
-          Region.isInOrder(
-            file.mapping[start],
-            file.mapping[stop]
-          )
-            // If we end our kgram on a ')', the location of the opening token is used.
-            // However, the location of this token in the file might be before
-            // the location of the starting token of the kmer
-            // For example: the last token of every ast is ')', closing the program.
-            // The location of this token is always (0, 0), since the program root is the first token.
-            // In this way, the 'end' token is before any other token in the AST.
-            || file.tokens[stop] === ")" ,
-          `Invalid ordering:
-             expected ${file.mapping[start]}
-             to start be before the end of ${file.mapping[stop]}`
-        );
+    return this.index;
+  }
+
+  /**
+   * Fingerprint the tokens of the entry's file and record every fingerprint
+   * in the index.
+   *
+   * The kgram range of each fingerprint is appended to `entry.kgrams` and its
+   * occurrence is added to the matching SharedFingerprint (created on first
+   * use). The fingerprint is ignored when the entry is an ignored file, when
+   * the fingerprint was already ignored, when it occurs in more files than
+   * `maxFingerprintFileCount`, or when its hash was ignored explicitly.
+   * Otherwise it is added to `entry.shared`.
+   *
+   * @param entry The file entry to process; its `kgrams` (and possibly
+   * `shared`) collections are mutated.
+   */
+  private addEntry(entry: FileEntry): void {
+    const file = entry.file;
+    let kgram = 0;
+    for (
+      const { data, hash, start, stop  }
+      of this.hashFilter.fingerprints(file.tokens)
+    ) {
 
-        const location = Region.merge(
+      // add kgram to file
+      entry.kgrams.push(new Range(start, stop));
+
+      // sanity check
+      assert(
+        Region.isInOrder(
           file.mapping[start],
           file.mapping[stop]
-        );
+        )
+        // If we end our kgram on a ')', the location of the opening token is used.
+        // However, the location of this token in the file might be before
+        // the location of the starting token of the kmer
+        // For example: the last token of every ast is ')', closing the program.
+        // The location of this token is always (0, 0), since the program root is the first token.
+        // In this way, the 'end' token is before any other token in the AST.
+        || file.tokens[stop] === ")" ,
+        `Invalid ordering:
+             expected ${file.mapping[start]}
+             to start be before the end of ${file.mapping[stop]}`
+      );
 
-        const part: Occurrence = {
-          file,
-          side: { index: kgram, start, stop, data, location }
-        };
+      const location = Region.merge(
+        file.mapping[start],
+        file.mapping[stop]
+      );
 
-        // look if the index already contains the given hashing
-        let shared: SharedFingerprint | undefined = this.index.get(hash);
+      const part: Occurrence = {
+        file,
+        side: { index: kgram, start, stop, data, location }
+      };
 
-        if (!shared) {
-          // if the hashing does not yet exist in the index, add it
-          shared = new SharedFingerprint(hash, data);
-          this.index.set(hash, shared);
-        }
+      // look if the index already contains the given hashing
+      let shared: SharedFingerprint | undefined = this.index.get(hash);
+
+      if (!shared) {
+        // if the hashing does not yet exist in the index, add it
+        shared = new SharedFingerprint(hash, data);
+        this.index.set(hash, shared);
+      }
 
-        shared.add(part);
+      shared.add(part);
+      // Checking `shared.ignored` keeps the result independent of insertion
+      // order: a fingerprint ignored earlier (e.g. by an ignored file that was
+      // added before this one) must stay ignored for files added afterwards.
+      if (
+        entry.isIgnored
+        || shared.ignored
+        || shared.fileCount() > this.maxFingerprintFileCount
+        || this.ignoredHashes.has(hash)
+      ) {
+        this.ignoreSharedFingerprint(shared);
+      } else {
         entry.shared.add(shared);
+      }
+
+      kgram += 1;
+    }
+  }
 
-        kgram += 1;
+  /**
+   * Explicitly ignore the given hashes.
+   *
+   * Every hash is remembered in `ignoredHashes`, so fingerprints with that
+   * hash that are added later are ignored as well. Fingerprints that are
+   * already present in the index are ignored immediately.
+   *
+   * @param hashes The hashes to ignore.
+   */
+  public addIgnoredHashes(hashes: Array<Hash>): void {
+    for (const hash of hashes) {
+      this.ignoredHashes.add(hash);
+      const shared = this.index.get(hash);
+      if (shared) {
+        this.ignoreSharedFingerprint(shared);
       }
     }
+  }
 
-    return this.index;
+  /**
+   * Mark a shared fingerprint as ignored and, for every file it occurs in
+   * that is not an ignored file, move it from the entry's `shared` set to its
+   * `ignored` set.
+   */
+  private ignoreSharedFingerprint(shared: SharedFingerprint): void {
+    shared.ignored = true;
+    for (const other of shared.files()) {
+      // Ignored files have no entry in `files`, so they are skipped.
+      if (!this.ignoredFiles.has(other.id)) {
+        // NOTE(review): assumes every non-ignored file of the fingerprint has an entry in `files`.
+        const otherEntry = this.files.get(other.id)!;
+        otherEntry.shared.delete(shared);
+        otherEntry.ignored.add(shared);
+      }
+    }
+  }
+
+  /**
+   * Mark a shared fingerprint as no longer ignored and move it from the
+   * `ignored` set back to the `shared` set of every analysed file it occurs in.
+   *
+   * Files without an entry in `files` (such as ignored files) are skipped,
+   * just like `ignoreSharedFingerprint` skips them.
+   */
+  private unIgnoreSharedFingerprint(shared: SharedFingerprint): void {
+    shared.ignored = false;
+    for (const other of shared.files()) {
+      const otherEntry = this.files.get(other.id);
+      if (otherEntry === undefined) {
+        // Not an analysed file (e.g. the ignored template file): nothing to move.
+        continue;
+      }
+      otherEntry.ignored.delete(shared);
+      otherEntry.shared.add(shared);
+    }
   }
 
   public sharedFingerprints(): Array<SharedFingerprint> {
@@ -121,6 +206,10 @@ export class FingerprintIndex {
     return Array.from(this.files.values());
   }
 
+  /**
+   * @returns A new array with the entries of all ignored files.
+   */
+  public ignoredEntries(): Array<FileEntry> {
+    return Array.from(this.ignoredFiles.values());
+  }
+
   public getPair(file1: TokenizedFile, file2: TokenizedFile): Pair {
     const entry1 = this.files.get(file1.id);
     const entry2 = this.files.get(file2.id);

diff --git a/core/src/algorithm/pair.ts b/core/src/algorithm/pair.ts
@@ -30,6 +30,8 @@ export class Pair extends Identifiable {
   public readonly rightTotal;
   public readonly longest;
   public readonly similarity;
+  public readonly leftIgnored;
+  public readonly rightIgnored;
 
   constructor(
     public readonly leftEntry: FileEntry,
@@ -71,10 +73,13 @@ export class Pair extends Identifiable {
 
     this.leftCovered = left.length;
     this.rightCovered = right.length;
+    // Ignored fingerprints are counted per side; each side reads its own entry.
+    this.leftIgnored = leftEntry.ignored.size;
+    this.rightIgnored = rightEntry.ignored.size;
     this.leftTotal = leftEntry.kgrams.length;
     this.rightTotal = rightEntry.kgrams.length;
-    if (this.leftTotal + this.rightTotal > 0) {
-      this.similarity = (this.leftCovered + this.rightCovered) / (this.leftTotal + this.rightTotal);
+    // Ignored fingerprints are excluded from the total used for similarity.
+    const denominator = this.leftTotal + this.rightTotal - this.leftIgnored - this.rightIgnored;
+    if (denominator > 0) {
+      this.similarity = (this.leftCovered + this.rightCovered) / denominator;
     } else {
       this.similarity = 0;
     }

diff --git a/core/src/algorithm/sharedFingerprint.ts b/core/src/algorithm/sharedFingerprint.ts
@@ -4,6 +4,9 @@ import { Identifiable } from "../util/identifiable.js";
 
 export class SharedFingerprint extends Identifiable {
 
+  // Whether this SharedFingerprint occurs in the boilerplate/template code
+  public ignored: boolean = false;
+
   private partMap: Map<TokenizedFile, Array<Occurrence>> = new Map();
 
   constructor(
@@ -40,4 +43,8 @@ export class SharedFingerprint extends Identifiable {
   public fileCount(): number {
     return this.partMap.size;
   }
+
+  /**
+   * @param file The tokenized file to look up (used as key of the part map).
+   * @returns True when this fingerprint has recorded parts for the given file.
+   */
+  public includesFile(file: TokenizedFile): boolean {
+    return this.partMap.has(file);
+  }
 }
diff --git a/core/src/test/pair.test.ts b/core/src/test/pair.test.ts
@@ -54,13 +54,17 @@ test("paired occurrence merging & squashing", t => {
   const left = {
     kgrams: new Array<Range>(),
     shared: new Set<SharedFingerprint>(),
-    file: leftFile
+    ignored: new Set<SharedFingerprint>(),
+    file: leftFile,
+    isIgnored: false,
   };
 
   const right = {
     kgrams: new Array<Range>(),
     shared: new Set<SharedFingerprint>(),
-    file: rightFile
+    ignored: new Set<SharedFingerprint>(),
+    file: rightFile,
+    isIgnored: false,
   };
 
 

diff --git a/docs/docs/running.md b/docs/docs/running.md
@@ -49,6 +49,28 @@ You can show all command line options by passing the `-h` or `--help` flag or by
 You can improve the plagiarism detection report by adding metadata to your submissions (submission time, labels, author name, ...).
 See the page about [adding metadata](/docs/adding-metadata) to see how.
 
+## Ignoring template code
+
+Programming exercises often have code in common that is not plagiarised. For example: class and method definitions, given test cases, boilerplate code, ...
+Dolos will often detect these code fragments as similar and include them in the similarity score, making it harder to spot actual plagiarism.
+
+With the `-i <path>` or `--ignore <path>` parameter, you can add an _ignore_ file (often also called a _template_ or _boilerplate_ file) to the analysis.
+Code fragments from analysed solutions that match this file will be ignored, and their fingerprints will not count towards similarity.
+
+In addition, it is also possible to **automatically detect** common code.
+By passing `-m <integer>` or `--max-fingerprint-count <integer>` you can specify a maximum number of files a code fragment can occur in before it is ignored.
+With `-M <fraction>` or `--max-fingerprint-percentage <fraction>` it is possible to specify this number as a fraction (percentage) of the total analysed file count.
+It is possible to combine this with specifying an ignore file with the `-i` option.
+
+
+Example usage:
+
+```sh
+# Ignore all code fragments occurring in more than half of the files,
+# or occurring in template.js
+dolos run -M 0.5 -i template.js solutions/*.js
+```
+
 ## Modifying plagiarism detection parameters
 
 The plagiarism detection parameters can be altered by passing the relevant arguments when running Dolos.