Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nl] improve CompoundAcceptor #9549

Merged
merged 1 commit into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,12 @@
public class CompoundAcceptor {

// compound parts that need an 's' appended to be used as first part of the compound:
// "teit", "ing", "heid", "schap", "ker"
private final Set<String> needsS1 = new HashSet<>(Arrays.asList(
"bedrijfs", "passagiers" //, "dorps", "gezichts", "lijdens", "besturings", "verbrandings", "bestemmings", "schoonheids"
private final Set<String> needsS = new HashSet<>(Arrays.asList(
"bedrijfs", "passagiers", "dorps", "gezichts", "lijdens", "besturings", "verbrandings", "bestemmings", "schoonheids"
));
// compound parts that must not have an 's' appended to be used as first part of the compound:
private final Set<String> noS1 = new HashSet<>(Arrays.asList(
"sport", "woning" //, "kinder", "fractie", "zout", "schade", "energie", "gemeente", "dienst", "wereld", "telefoon", "winkel", "aandeel", "zwanger", "papier"
private final Set<String> noS = new HashSet<>(Arrays.asList(
"woning", "kinder", "fractie", "schade", "energie", "gemeente", "dienst", "wereld", "telefoon", "aandeel", "zwanger", "papier"
));

private final MorfologikDutchSpellerRule speller;
Expand Down Expand Up @@ -79,11 +78,11 @@ boolean acceptCompound(String part1, String part2) throws IOException {
boolean okWithDash = false;
boolean okWithoutS = false;
if (part1.endsWith("s")) {
okWithS = spellingOk(part1.replaceFirst("s$", "")) && spellingOk(part2) && needsS1.contains(part1);
okWithS = spellingOk(part1.replaceFirst("s$", "")) && spellingOk(part2) && needsS.contains(part1.toLowerCase());
} else if ( part1.endsWith("-")) {
okWithDash = abbrevOk(part1) && spellingOk(part2);
} else {
okWithoutS = spellingOk(part1) && spellingOk(part2) && noS1.contains(part1);
okWithoutS = spellingOk(part1) && spellingOk(part2) && noS.contains(part1.toLowerCase());
}
//System.out.println(" okWithS: " + okWithS + ", okWithoutS " + okWithoutS);
return okWithS || okWithDash || okWithoutS;
Expand All @@ -96,7 +95,7 @@ private boolean abbrevOk(String nonCompound) {

private boolean spellingOk(String nonCompound) throws IOException {
AnalyzedSentence as = new AnalyzedSentence(new AnalyzedTokenReadings[] {
new AnalyzedTokenReadings(new AnalyzedToken(nonCompound, "FAKE_POS", "fakeLemma"))
new AnalyzedTokenReadings(new AnalyzedToken(nonCompound.toLowerCase(), "FAKE_POS", "fakeLemma"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that the PR is merged, I wonder whether this change is OK: aren't there words, such as country names or proper names in general, that are always capitalized and therefore won't be accepted once they are lowercased? These are probably less likely to appear as part of a compound, but maybe you can add a test case for them anyway?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, there are some exceptions that will still need to keep their uppercase letters. As you said, they are less likely, but I'll think of a better solution for them as I add more words.

});
RuleMatch[] matches = speller.match(as);
return matches.length == 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,25 +37,24 @@ public void testAcceptCompound() throws IOException {
assertTrue(acceptor.acceptCompound("VRF-regels"));
assertFalse(acceptor.acceptCompound("VRFregels"));

//assertTrue(acceptor.acceptCompound("winkeldiefstal"));
//assertFalse(acceptor.acceptCompound("winkelsdiefstal"));

assertTrue(acceptor.acceptCompound("bedrijfsregels"));
assertFalse(acceptor.acceptCompound("bedrijfregels"));

//assertTrue(acceptor.acceptCompound("zwangerschap"));
//assertFalse(acceptor.acceptCompound("zwangersschap"));
assertTrue(acceptor.acceptCompound("Bedrijfsbrommer"));
assertFalse(acceptor.acceptCompound("Bedrijfbrommer"));

assertTrue(acceptor.acceptCompound("zwangerschap"));
assertFalse(acceptor.acceptCompound("zwangersschap"));

//assertTrue(acceptor.acceptCompound("papierversnipperaar"));
//assertFalse(acceptor.acceptCompound("papiersversnipperaar"));
assertTrue(acceptor.acceptCompound("Papierversnipperaar"));
assertFalse(acceptor.acceptCompound("Papiersversnipperaar"));
}

/**
 * Exercises the two-argument {@code acceptCompound(part1, part2)} overload directly,
 * passing the compound already split into its first and second part.
 */
@Test
public void testAcceptCompoundInternal() throws IOException {
CompoundAcceptor acceptor = new CompoundAcceptor();
// First part ends in "s" ("passagiers").
assertTrue(acceptor.acceptCompound("passagiers", "schip"));
//assertTrue(acceptor.acceptCompound("papier", "versnipperaar"));
//assertTrue(acceptor.acceptCompound("winkel", "diefstal"));
// First part is used without an appended "s" ("papier").
assertTrue(acceptor.acceptCompound("papier", "versnipperaar"));
}

}