From d14ba19babb09e4e73e36f397f3991fd154a56fd Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 26 Oct 2024 13:30:59 -0400 Subject: [PATCH 01/11] fix for \b regexp with JDK>=19 --- .../rules/patterns/PatternRuleHandler.java | 3 +- .../patterns/RegexAntiPatternFilter.java | 2 +- .../rules/patterns/XMLRuleHandler.java | 12 ++- .../org/languagetool/resource/segment.srx | 91 ++++++++++--------- 4 files changed, 58 insertions(+), 50 deletions(-) diff --git a/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleHandler.java b/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleHandler.java index 2bd150784472..04f43cf8d5c1 100644 --- a/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleHandler.java +++ b/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleHandler.java @@ -776,7 +776,8 @@ private void createRules(List elemList, rule.setDistanceTokens(distanceTokens); rule.setXmlLineNumber(xmlLineNumber); } else if (regex.length() > 0) { - int flags = regexCaseSensitive ? 0 : Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE; +// int flags = regexCaseSensitive ? 0 : Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE; + int flags = regexCaseSensitive ? 
Pattern.UNICODE_CHARACTER_CLASS : Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CHARACTER_CLASS; String regexStr = regex.toString(); if (regexMode == RegexpMode.SMART) { // Note: it's not that easy to add \b because the regex might look like '(foo)' or '\d' so we cannot just look at the last character diff --git a/languagetool-core/src/main/java/org/languagetool/rules/patterns/RegexAntiPatternFilter.java b/languagetool-core/src/main/java/org/languagetool/rules/patterns/RegexAntiPatternFilter.java index d7e7592e8500..06c4910344d6 100644 --- a/languagetool-core/src/main/java/org/languagetool/rules/patterns/RegexAntiPatternFilter.java +++ b/languagetool-core/src/main/java/org/languagetool/rules/patterns/RegexAntiPatternFilter.java @@ -42,7 +42,7 @@ public RuleMatch acceptRuleMatch(RuleMatch match, Map arguments, } String[] antiPatterns = antiPatternStr.split("\\|"); for (String antiPattern : antiPatterns) { - Pattern p = Pattern.compile(antiPattern); + Pattern p = Pattern.compile(antiPattern, Pattern.UNICODE_CHARACTER_CLASS); Matcher matcher = p.matcher(sentenceObj.getText()); while (matcher.find()) { // partial overlap is enough to filter out a match: diff --git a/languagetool-core/src/main/java/org/languagetool/rules/patterns/XMLRuleHandler.java b/languagetool-core/src/main/java/org/languagetool/rules/patterns/XMLRuleHandler.java index 2dc66dbbf6c8..9a0fa22038a6 100644 --- a/languagetool-core/src/main/java/org/languagetool/rules/patterns/XMLRuleHandler.java +++ b/languagetool-core/src/main/java/org/languagetool/rules/patterns/XMLRuleHandler.java @@ -18,11 +18,18 @@ */ package org.languagetool.rules.patterns; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.ResourceBundle; +import java.util.function.Function; + import org.apache.commons.lang3.ObjectUtils; import org.apache.commons.lang3.tuple.Triple; import org.jetbrains.annotations.Nullable; import org.languagetool.Language; 
-import org.languagetool.ResourceBundleTools; import org.languagetool.chunking.ChunkTag; import org.languagetool.rules.CorrectExample; import org.languagetool.rules.ErrorTriggeringExample; @@ -35,9 +42,6 @@ import org.xml.sax.SAXParseException; import org.xml.sax.helpers.DefaultHandler; -import java.util.*; -import java.util.function.Function; - /** * XML rule handler that loads rules from XML and throws * exceptions on errors and warnings. diff --git a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx index f7752c40e259..0836497e4358 100644 --- a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx +++ b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx @@ -4424,6 +4424,7 @@ \p{Lu}\p{Ll} + \b\d+\.\s @@ -4475,16 +4476,16 @@ \p{Ll} -\b[0-9]+(руб|Руб|тыс|Тыс|трлн|млн|млрд)\.\s +(?U)\b[0-9]+(руб|Руб|тыс|Тыс|трлн|млн|млрд)\.\s \b[0-9]+ -\b(бульв|г|д|доп|др|е|зам|Зам|и|им|инд|исп|Исп)\.\s +(?U)\b(бульв|г|д|доп|др|е|зам|Зам|и|им|инд|исп|Исп)\.\s -\b(англ|в|вв|га|гг|гл|гос|грн|дм|долл|е|ед)\.\s +(?U)\b(англ|в|вв|га|гг|гл|гос|грн|дм|долл|е|ед)\.\s \p{Ll} @@ -4492,7 +4493,7 @@ -\b(кг|км|коп|л|лл|м|мг|мин|мл|млн|Млн|млрд|Млрд|мм)\.\s +(?U)\b(кг|км|коп|л|лл|м|мг|мин|мл|млн|Млн|млрд|Млрд|мм)\.\s \p{Ll} @@ -4504,7 +4505,7 @@ \p{Ll} -\b(руб|Руб|тыс|Тыс|трлн)\.\s +(?U)\b(руб|Руб|тыс|Тыс|трлн)\.\s \p{Ll} @@ -4512,7 +4513,7 @@ -\b(ч|чел|шт|экз)\.\s +(?U)\b(ч|чел|шт|экз)\.\s \p{Ll} @@ -4748,6 +4749,7 @@ \p{Lu}\p{Ll} + Yahoo![\s\u00A0] @@ -4793,43 +4795,43 @@ -\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] +(?U)\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] -\b(dc|inst|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. 
Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] +(?U)\b(dc|inst|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. 
Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] -\b(s|ca)\.[\s\u00A0] -[XIV]+\b +(?U)\b(s|ca)\.[\s\u00A0] +(?U)[XIV]+\b -\b(min|m|ca)\.[\s\u00A0] -[0-9]+\b +(?U)\b(min|m|ca)\.[\s\u00A0] +(?U)[0-9]+\b -\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0] -[XIV\d]+\b +(?U)\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0] +(?U)[XIV\d]+\b -\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] [\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll} -\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} 
-\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -\b\p{Lu}{2}\.[\s\u00A0]? +(?U)\b\p{Lu}{2}\.[\s\u00A0]? \p{Lu}{2} @@ -4838,17 +4840,17 @@ -\b([Ee]tc|m[aáà]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b([Ee]tc|m[aáà]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] -\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} @@ -4858,12 +4860,12 @@ -\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] +(?U)\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] \p{Ll} @@ -4872,7 +4874,7 @@ -\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] +(?U)\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] [¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}* @@ -5047,16 +5049,16 @@ -\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. -[A-Za-z0-9\-]+(\.|\b) +(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +(?U)[A-Za-z0-9\-]+(\.|\b) -\b[Se]even\. -[Oo]nes?\b +(?U)\b[Se]even\. +(?U)[Oo]nes?\b -\b[A-Za-z0-9\-]+\. -[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) +(?U)\b[A-Za-z0-9\-]+\. +(?U)[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) @@ -5106,12 +5108,12 @@ Klässler[sn]? -\bP[Hh]\. +(?U)\bP[Hh]\. D\. -\b\p{L}\. +(?U)\b\p{L}\. 
-\b\d+\.[\u00A0\s]{1,2} +(?U)\b\d+\.[\u00A0\s]{1,2} \p{Ll}|\p{Lu}{2,} @@ -5193,45 +5195,45 @@ -\b(Mrs?|No|pp|St|no|Sr|Jr|[Ss]ek|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]{1,2} +(?U)\b(Mrs?|No|pp|St|no|Sr|Jr|[Ss]ek|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]{1,2} -\b(spp?)\.[\u00A0\s]{1,2} +(?U)\b(spp?)\.[\u00A0\s]{1,2} -\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|vsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|Dez|[Jj]gdfr|[Ee]ff)\.[\u00A0\s]{1,2} +(?U)\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|vsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|Dez|[Jj]gdfr|[Ee]ff)\.[\u00A0\s]{1,2} -\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|[Dd]t|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2} +(?U)\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|[Dd]t|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2} -\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]{1,2} 
+(?U)\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]{1,2} -\b[BM]\.[\u00A0\s]Sc\.[\u00A0\s] +(?U)\b[BM]\.[\u00A0\s]Sc\.[\u00A0\s] \p{Ll} -\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2} +(?U)\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2} -\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2} +(?U)\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2} -\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|[Gg]em|Pat|prov|Betr|lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mio|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]{1,2} +(?U)\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|[Gg]em|Pat|prov|Betr|lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mio|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]{1,2} -\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|[Ss]td?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]{1,2} +(?U)\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|[Ss]td?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]{1,2} @@ -5243,7 +5245,7 @@ [\-–][\u00A0\s]\d+ 
-\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|voraussichtl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2} +(?U)\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|voraussichtl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2} @@ -5265,6 +5267,7 @@ [“„] + From de7711489cf626923cc8e7188eaeeb5f78794fdb Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 26 Oct 2024 12:37:39 -0400 Subject: [PATCH 02/11] [ro] fix \b regex for JDK>=19 --- .../org/languagetool/resource/segment.srx | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx index 0836497e4358..934caec4a488 100644 --- a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx +++ b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx @@ -1459,7 +1459,7 @@ -\b\d+\.\s +(?U)\b\d+\.\s \p{Ll}|\p{Lu}{2,} @@ -1471,11 +1471,11 @@ -\b\p{L}\.\s +(?U)\b\p{L}\.\s \p{L}\.\s -\b\p{L}\. +(?U)\b\p{L}\. \p{L}\. @@ -1495,39 +1495,39 @@ [^\p{Lu}] -\b(etc|șamd)\.\s +(?U)\b(etc|șamd)\.\s [A-Z] -\b(pag|leg|art)\.\s +(?U)\b(pag|leg|art)\.\s -\b(ian|febr?|mart?|apr|iu[nl]|aug|sept?|oct|nov|dec)\.\s +(?U)\b(ian|febr?|mart?|apr|iu[nl]|aug|sept?|oct|nov|dec)\.\s [^\p{Lu}] -\bdpdv\.\s +(?U)\bdpdv\.\s -\b(etc|șamd)\.\s +(?U)\b(etc|șamd)\.\s -\b(M)\. +(?U)\b(M)\. Ap\.N\.\s -\b(M)\.Ap\. +(?U)\b(M)\.Ap\. 
N\.\s -\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s +(?U)\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s -\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s[A-Z]\.\s +(?U)\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s[A-Z]\.\s From bf95c0b00848f5008d9eca3a2a122a804c81d703 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 26 Oct 2024 12:50:02 -0400 Subject: [PATCH 03/11] [pt] fix \b regex for JDK>=19 --- .../org/languagetool/resource/segment.srx | 74 +++++++++---------- .../resource/pt/entities/hyphenised.ent | 2 +- .../org/languagetool/resource/pt/pt.sor | 2 +- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx index 934caec4a488..66b535d20898 100644 --- a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx +++ b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx @@ -6238,129 +6238,129 @@ -\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. [A-Za-z0-9\-]+(\.|\b) -\b[A-Za-z0-9\-]+\. +(?U)\b[A-Za-z0-9\-]+\. [A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) -\b(a|Ab|abr|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|ago|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s? +(?U)\b(a|Ab|abr|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|ago|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s? -\b(Bot|barb|B\.el|Bibl|Biol|Bioquím|burl)\.\s? +(?U)\b(Bot|barb|B\.el|Bibl|Biol|Bioquím|burl)\.\s? -\b(ca|card|cat|caus|cf|cit|cód|comp|compar|conj|contr|coord|cop)\.\s? +(?U)\b(ca|card|cat|caus|cf|cit|cód|comp|compar|conj|contr|coord|cop)\.\s? -\b(D|def|dem|deprec|deriv|det|dez|disj|[Dd]ra?s?)\.\s? +(?U)\b(D|def|dem|deprec|deriv|det|dez|disj|[Dd]ra?s?)\.\s? -\b(Ecol|Econ|ed|elem|Eng|erud|estrang|ex|Ex)\.\s? 
+(?U)\b(Ecol|Econ|ed|elem|Eng|erud|estrang|ex|Ex)\.\s? -\b(etc)\.\s? +(?U)\b(etc)\.\s? \p{Ll} -\b(f|fam|Farm|fem|fev|fig|fin|fl|fr|frac)\.\s? +(?U)\b(f|fam|Farm|fem|fev|fig|fin|fl|fr|frac)\.\s? -\b(gén|geog|Geogr|Geol|Geom|gír|gloss|Gram)\.\s? +(?U)\b(gén|geog|Geogr|Geol|Geom|gír|gloss|Gram)\.\s? -\b(hab|hist|Hort)\.\s? +(?U)\b(hab|hist|Hort)\.\s? -\b(Ibid|id|i.e|incompat|indef|inf|infant|Inform|integr|interj|interr|intr|inv)\.\s? +(?U)\b(Ibid|id|i.e|incompat|indef|inf|infant|Inform|integr|interj|interr|intr|inv)\.\s? -\b(jan|jul|jun|Jorn|Jur)\.\s? +(?U)\b(jan|jul|jun|Jorn|Jur)\.\s? -\b(lat|Lat|Lda|Ling|Lit|liv|loc|log|Lóg|long)\.\s? +(?U)\b(lat|Lat|Lda|Ling|Lit|liv|loc|log|Lóg|long)\.\s? -\b(m|mai|mar|masc|Mat|máx|Mecân|[Mm]ed|Mil|mín|mult|Mús)\.\s? +(?U)\b(m|mai|mar|masc|Mat|máx|Mecân|[Mm]ed|Mil|mín|mult|Mús)\.\s? -\b(n|N|Náut|N.B|neg|neol|nov|num|núm)\.\s? +(?U)\b(n|N|Náut|N.B|neg|neol|nov|num|núm)\.\s? -\b(ord|out)\.\s? +(?U)\b(ord|out)\.\s? -\b(pág|págs|Paleont|part|pass|[Pp]edag|pejor|pess|Pesc|p|Pe|p.f|pl|pleb|p.m|poét|[Pp]olít|pop|pov|poss|p.p|p.p.m|pp|pref|prep|[Pp]rof|pron|P.S)\.\s? +(?U)\b(pág|págs|Paleont|part|pass|[Pp]edag|pejor|pess|Pesc|p|Pe|p.f|pl|pleb|p.m|poét|[Pp]olít|pop|pov|poss|p.p|p.p.m|pp|pref|prep|[Pp]rof|pron|P.S)\.\s? -\b(q.b|q.do|Q.E|Q.I|ql)\.\s? +(?U)\b(q.b|q.do|Q.E|Q.I|ql)\.\s? -\b(R|rel|Relig|Rev)\.\s? +(?U)\b(R|rel|Relig|Rev)\.\s? -\b(S|S.A|set|símb|S. ?M|[Ss]ra?s?|[Ss]rta|suf|superl)\.\s? +(?U)\b(S|S.A|set|símb|S. ?M|[Ss]ra?s?|[Ss]rta|suf|superl)\.\s? -\b(t|tip|Tip|tít|top|[Tt]opogr|tr|trad|Trás-os-M|trim)\.\s? +(?U)\b(t|tip|Tip|tít|top|[Tt]opogr|tr|trad|Trás-os-M|trim)\.\s? -\b(Univ)\.\s? +(?U)\b(Univ)\.\s? -\b(v|V|vd|vid|voc|vol|V.S|vs|vulg)\.\s? +(?U)\b(v|V|vd|vid|voc|vol|V.S|vs|vulg)\.\s? -\b(Zool)\.\s? +(?U)\b(Zool)\.\s? -\bs([eé]c)?\.\s? +(?U)\bs([eé]c)?\.\s? [IVXDMCL]+ -\b(Mr|Mrs|No|pp|St|Jr|Bros|etc|vs|esp|[Ff]ig|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co|Ltd)\.\s? 
+(?U)\b(Mr|Mrs|No|pp|St|Jr|Bros|etc|vs|esp|[Ff]ig|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co|Ltd)\.\s? -\b(sp|spp)\.\s? +(?U)\b(sp|spp)\.\s? -\b[A-ZÀÉÈÍÓÒÚ]\.\s? +(?U)\b[A-ZÀÉÈÍÓÒÚ]\.\s? -\b[ad]\.\s? +(?U)\b[ad]\.\s? C\. @@ -6376,7 +6376,7 @@ S\. -\bP[Hh]\.\s? +(?U)\bP[Hh]\.\s? D\.? @@ -6395,27 +6395,27 @@ -\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s +(?U)\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s \p{Ll} -\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s +(?U)\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s \p{Ll} -\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s +(?U)\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s \p{Ll} -\bet al\.[\p{Pe}\p{Pf}\p{Pd}"”']*\s +(?U)\bet al\.[\p{Pe}\p{Pf}\p{Pd}"”']*\s -\b([Ee]sc|K[gm]s?|[mc]?[gml]s]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s +(?U)\b([Ee]sc|K[gm]s?|[mc]?[gml]s]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s \p{Ll} @@ -6466,7 +6466,7 @@ -\b(etc)\.\s? +(?U)\b(etc)\.\s? \p{Lu}\p{Ll}* diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/hyphenised.ent b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/hyphenised.ent index 8b17a479a3a0..fcc36aa2bbc3 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/hyphenised.ent +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/hyphenised.ent @@ -1,2 +1,2 @@ - + diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/pt.sor b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/pt.sor index 60eae1dc4509..f9f2cf948051 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/pt.sor +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/pt.sor @@ -178,7 +178,7 @@ f:(.*),(.*) \1\2 == ordinal-feminine == ([-−]?\d+) $(ordinal-feminine $(ordinal-masculine \1)) -(.*)o\b(.*) 
$(ordinal-feminine \1a\2) +(?U)(.*)o\b(.*) $(ordinal-feminine \1a\2) (.*) \1 == (ordinal)-number(-feminine|-masculine)? == From 1fdbfd8218eab4ae28227c7bf9e930ef7ab4f603 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 26 Oct 2024 13:01:33 -0400 Subject: [PATCH 04/11] [fr] fix \b regex for JDK>=19 --- .../org/languagetool/resource/segment.srx | 87 ++++++++++--------- .../org/languagetool/language/French.java | 6 +- .../rules/fr/MakeContractionsFilter.java | 8 +- .../tokenizers/fr/FrenchWordTokenizer.java | 4 +- 4 files changed, 53 insertions(+), 52 deletions(-) diff --git a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx index 66b535d20898..b199286410e7 100644 --- a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx +++ b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx @@ -5473,6 +5473,7 @@ \p{Lu}\p{Ll} + [\s\u00A0] @@ -5508,28 +5509,28 @@ -\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. -[A-Za-z0-9\-]+(\.|\b) +(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +(?U)[A-Za-z0-9\-]+(\.|\b) -\b[A-Za-z0-9\-]+\. -[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) +(?U)\b[A-Za-z0-9\-]+\. +(?U)[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) -\b[A-Za-z0-9\-]+\. -[A-Za-z]{2,5}(\.|\b) +(?U)\b[A-Za-z0-9\-]+\. 
+(?U)[A-Za-z]{2,5}(\.|\b) -\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0] +(?U)\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0] \p{Ll} -\b(etc)\.\)[\s\u00A0] +(?U)\b(etc)\.\)[\s\u00A0] -\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0] +(?U)\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0] @@ -5553,11 +5554,11 @@ -\b\p{L}\.[\s\u00A0] +(?U)\b\p{L}\.[\s\u00A0] \p{L}\.[\s\u00A0] -\b\p{L}\. +(?U)\b\p{L}\. \p{L}\. @@ -5585,11 +5586,11 @@ [^\p{Lu}] -\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0] +(?U)\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0] -\b\p{Lu}\.\p{Lu}\.[\s\u00A0] +(?U)\b\p{Lu}\.\p{Lu}\.[\s\u00A0] @@ -5597,11 +5598,11 @@ -\b(:?Blvd|Ave|Mts?)\.[\s\u00A0] +(?U)\b(:?Blvd|Ave|Mts?)\.[\s\u00A0] \p{Ll}+ -\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0] +(?U)\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0] \p{Ll}+ @@ -5614,39 +5615,39 @@ [A-Z]\.[A-Z]\. -[A-Z]\b +(?U)[A-Z]\b -\bL\. -A\b +(?U)\bL\. +(?U)A\b -\bU\. -[SK]\b +(?U)\bU\. +(?U)[SK]\b -\b[nN]o\.[\s\u00A0] +(?U)\b[nN]o\.[\s\u00A0] \p{N} -\bP[Hh]\.[\s\u00A0]? +(?U)\bP[Hh]\.[\s\u00A0]? D\.? -\be\.g\.[\s\u00A0] +(?U)\be\.g\.[\s\u00A0] -\bvs\.[\s\u00A0] +(?U)\bvs\.[\s\u00A0] -\b[Ee]tc\.[\s\u00A0] +(?U)\b[Ee]tc\.[\s\u00A0] [^\p{Lu}] -\b([Bb]tw|BTW)\.[\s\u00A0] +(?U)\b([Bb]tw|BTW)\.[\s\u00A0] @@ -5658,64 +5659,64 @@ 3|4|Buzz|Crozz -\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0] +(?U)\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0] -\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0] +(?U)\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0] -\bLL\.[\s\u00A0]?[BMD]\.[\s\u00A0] +(?U)\bLL\.[\s\u00A0]?[BMD]\.[\s\u00A0] -\b[BM]\.[\s\u00A0]? +(?U)\b[BM]\.[\s\u00A0]? Eng\.? -\bLL\.[\s\u00A0]? +(?U)\bLL\.[\s\u00A0]? [BMD]\.? -\b[BM]\.[\s\u00A0]? +(?U)\b[BM]\.[\s\u00A0]? Sc\.? -\b[BM]\.[\s\u00A0]? +(?U)\b[BM]\.[\s\u00A0]? Comp?\.? -\b[BM]\.[\s\u00A0]? +(?U)\b[BM]\.[\s\u00A0]? Arch\.? 
-\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0] +(?U)\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0] -\bI(nc|NC)\.[\s\u00A0] +(?U)\bI(nc|NC)\.[\s\u00A0] -\bCorp\.[\s\u00A0] +(?U)\bCorp\.[\s\u00A0] -\bBros\.[\s\u00A0] +(?U)\bBros\.[\s\u00A0] -\bLtd\.[\s\u00A0] +(?U)\bLtd\.[\s\u00A0] \p{Ll}+ -\bCo\.[\s\u00A0] +(?U)\bCo\.[\s\u00A0] -\bE\.[\s\u00A0] -\b[Cc]oli\b +(?U)\bE\.[\s\u00A0] +(?U)\b[Cc]oli\b @@ -5734,7 +5735,7 @@ -\b[0-9]+(\.|:)[0-9][0-9][\s\u00A0\u202F] +(?U)\b[0-9]+(\.|:)[0-9][0-9][\s\u00A0\u202F] diff --git a/languagetool-language-modules/fr/src/main/java/org/languagetool/language/French.java b/languagetool-language-modules/fr/src/main/java/org/languagetool/language/French.java index ceebef3a2005..3d431644aa81 100644 --- a/languagetool-language-modules/fr/src/main/java/org/languagetool/language/French.java +++ b/languagetool-language-modules/fr/src/main/java/org/languagetool/language/French.java @@ -50,9 +50,9 @@ public class French extends Language implements AutoCloseable { private static final String BEFORE_APOS = "([cjnmtsldCJNMTSLD]|qu|jusqu|lorsqu|puisqu|quoiqu|Qu|Jusqu|Lorsqu|Puisqu|Quoiqu|QU|JUSQU|LORSQU|PUISQU|QUOIQU)"; - private static final Pattern BEFORE_APOS_PATTERN_1 = compile("(\\b" + BEFORE_APOS + ")'"); - private static final Pattern BEFORE_APOS_PATTERN_2 = compile("(\\b" + BEFORE_APOS + ")’\""); - private static final Pattern BEFORE_APOS_PATTERN_3 = compile("(\\b" + BEFORE_APOS + ")’'"); + private static final Pattern BEFORE_APOS_PATTERN_1 = compile("(\\b" + BEFORE_APOS + ")'", Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern BEFORE_APOS_PATTERN_2 = compile("(\\b" + BEFORE_APOS + ")’\"", Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern BEFORE_APOS_PATTERN_3 = compile("(\\b" + BEFORE_APOS + ")’'", Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern TYPOGRAPHY_PATTERN_1 = compile("\u00a0;"); private static final Pattern TYPOGRAPHY_PATTERN_2 = compile("\u00a0!"); diff --git 
a/languagetool-language-modules/fr/src/main/java/org/languagetool/rules/fr/MakeContractionsFilter.java b/languagetool-language-modules/fr/src/main/java/org/languagetool/rules/fr/MakeContractionsFilter.java index 3837ddd01816..740de82b848e 100644 --- a/languagetool-language-modules/fr/src/main/java/org/languagetool/rules/fr/MakeContractionsFilter.java +++ b/languagetool-language-modules/fr/src/main/java/org/languagetool/rules/fr/MakeContractionsFilter.java @@ -26,10 +26,10 @@ public class MakeContractionsFilter extends AbstractMakeContractionsFilter { - private static final Pattern DE_LE = Pattern.compile("\\bde le\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); - private static final Pattern A_LE = Pattern.compile("\\bà le\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); - private static final Pattern DE_LES = Pattern.compile("\\bde les\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); - private static final Pattern A_LES = Pattern.compile("\\bà les\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + private static final Pattern DE_LE = Pattern.compile("\\bde le\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern A_LE = Pattern.compile("\\bà le\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern DE_LES = Pattern.compile("\\bde les\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern A_LES = Pattern.compile("\\bà les\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); protected String fixContractions(String suggestion) { Matcher matcher = DE_LE.matcher(suggestion); diff --git a/languagetool-language-modules/fr/src/main/java/org/languagetool/tokenizers/fr/FrenchWordTokenizer.java b/languagetool-language-modules/fr/src/main/java/org/languagetool/tokenizers/fr/FrenchWordTokenizer.java index 0564f2710cb0..14a5e651f40e 100644 --- 
a/languagetool-language-modules/fr/src/main/java/org/languagetool/tokenizers/fr/FrenchWordTokenizer.java +++ b/languagetool-language-modules/fr/src/main/java/org/languagetool/tokenizers/fr/FrenchWordTokenizer.java @@ -72,9 +72,9 @@ public class FrenchWordTokenizer extends WordTokenizer { private static final Pattern SPACE_DIGITS0 = Pattern.compile("([\\d]{4}) ", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); private static final Pattern SPACE_DIGITS = Pattern.compile("([\\d]) ([\\d][\\d][\\d])\\b", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern SPACE_DIGITS2 = Pattern.compile("([\\d]) ([\\d][\\d][\\d]) ([\\d][\\d][\\d])\\b", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern SPACE0 = Pattern.compile("xxFR_SPACE0xx"); private static final List doNotSplit = Arrays.asList("mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", From c2e324259ccd8470b72b3c871f4e97b1333eaa73 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 26 Oct 2024 13:04:29 -0400 Subject: [PATCH 05/11] [es] fix \b regex for JDK>=19 --- .../org/languagetool/resource/segment.srx | 56 ++++++++++--------- .../org/languagetool/language/Spanish.java | 2 +- .../tokenizers/es/SpanishWordTokenizer.java | 2 +- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx index b199286410e7..97317656affe 100644 --- a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx +++ b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx @@ -4883,6 +4883,7 @@ »[^\u00A0\s\.:!?…] + @@ -4932,7 +4933,7 @@ -\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] +(?U)\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] @@ -4942,76 +4943,76 @@ -\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] 
+(?U)\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] \p{Ll} -\b(s|ca)\.[\s\u00A0] -[XIV]+\b +(?U)\b(s|ca)\.[\s\u00A0] +(?U)[XIV]+\b -\b(min|m|ca)\.[\s\u00A0] -[0-9]+\b +(?U)\b(min|m|ca)\.[\s\u00A0] +(?U)[0-9]+\b -\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0] -[XIV\d]+\b +(?U)\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0] +(?U)[XIV\d]+\b -\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] [\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll} -\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. -[A-Za-z0-9\-]+(\.|\b) +(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +(?U)[A-Za-z0-9\-]+(\.|\b) -\b[A-Za-z0-9\-]+\. -[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) +(?U)\b[A-Za-z0-9\-]+\. 
+(?U)[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) -\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0] +(?U)\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0] -\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] 
+(?U)\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] -\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0] +(?U)\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0] -\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0] +(?U)\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0] -\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0] +(?U)\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0] -\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0] +(?U)\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0] -\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] 
+(?U)\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] [\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll} -\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -\b\p{Lu}{2}\.[\s\u00A0]? +(?U)\b\p{Lu}{2}\.[\s\u00A0]? \p{Lu}{2} @@ -5020,17 +5021,17 @@ -\b([Ee]tc|m[aá]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b([Ee]tc|m[aá]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] -\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +(?U)\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} @@ -5038,7 +5039,7 @@ -\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] +(?U)\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] [¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}* @@ -5047,6 +5048,7 @@ »[^\u00A0\s\.:!?…] + (?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. 
diff --git a/languagetool-language-modules/es/src/main/java/org/languagetool/language/Spanish.java b/languagetool-language-modules/es/src/main/java/org/languagetool/language/Spanish.java index 58bb5bfda4a2..2cfc141199d4 100644 --- a/languagetool-language-modules/es/src/main/java/org/languagetool/language/Spanish.java +++ b/languagetool-language-modules/es/src/main/java/org/languagetool/language/Spanish.java @@ -315,7 +315,7 @@ public boolean hasMinMatchesRules() { return true; } - private static final Pattern ES_CONTRACTIONS = Pattern.compile("\\b([Aa]|[Dd]e) e(l)\\b"); + private static final Pattern ES_CONTRACTIONS = Pattern.compile("\\b([Aa]|[Dd]e) e(l)\\b", Pattern.UNICODE_CHARACTER_CLASS); @Override public String adaptSuggestion(String replacement) { diff --git a/languagetool-language-modules/es/src/main/java/org/languagetool/tokenizers/es/SpanishWordTokenizer.java b/languagetool-language-modules/es/src/main/java/org/languagetool/tokenizers/es/SpanishWordTokenizer.java index 7b7a114f356f..0bd7594ecec2 100644 --- a/languagetool-language-modules/es/src/main/java/org/languagetool/tokenizers/es/SpanishWordTokenizer.java +++ b/languagetool-language-modules/es/src/main/java/org/languagetool/tokenizers/es/SpanishWordTokenizer.java @@ -43,7 +43,7 @@ public class SpanishWordTokenizer extends WordTokenizer { // decimal comma between digits private static final Pattern DECIMAL_COMMA= Pattern.compile("([\\d]),([\\d])",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); // ordinals - private static final Pattern ORDINAL_POINT= Pattern.compile("\\b([\\d]+)\\.(º|ª|o|a|er|os|as)\\b",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); + private static final Pattern ORDINAL_POINT= Pattern.compile("\\b([\\d]+)\\.(º|ª|o|a|er|os|as)\\b",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern PATTERN_1 = Pattern.compile("xxES_DECIMAL_POINTxx", Pattern.LITERAL); private static final Pattern PATTERN_2 = Pattern.compile("xxES_DECIMAL_COMMAxx", Pattern.LITERAL); 
private static final Pattern PATTERN_3 = Pattern.compile("xxES_ORDINAL_POINTxx", Pattern.LITERAL); From 7c78310d53c32a69f4d2ca9bad815688b91b5cac Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 26 Oct 2024 13:13:35 -0400 Subject: [PATCH 06/11] [ca] fix \b regex for JDK>=19 --- .../org/languagetool/language/Catalan.java | 30 +++++++++---------- .../rules/ca/PronomsFeblesHelper.java | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java b/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java index db813750b83d..7848b2faeef9 100644 --- a/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java +++ b/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java @@ -45,9 +45,9 @@ public class Catalan extends Language { - private static final Pattern PATTERN_1 = compile("(\\b[lmnstdLMNSTD])'"); - private static final Pattern PATTERN_2 = compile("(\\b[lmnstdLMNSTD])’\""); - private static final Pattern PATTERN_3 = compile("(\\b[lmnstdLMNSTD])’'"); + private static final Pattern PATTERN_1 = compile("(\\b[lmnstdLMNSTD])'", Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern PATTERN_2 = compile("(\\b[lmnstdLMNSTD])’\"", Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern PATTERN_3 = compile("(\\b[lmnstdLMNSTD])’'", Pattern.UNICODE_CHARACTER_CLASS); @Override public String getName() { @@ -345,7 +345,7 @@ public SpellingCheckRule createDefaultSpellingRule(ResourceBundle messages) thro return new MorfologikCatalanSpellerRule(messages, this, null, Collections.emptyList()); } - private static final Pattern CA_OLD_DIACRITICS = compile(".*\\b(sóc|dóna|dónes|vénen|véns|fóra)\\b.*",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); + private static final Pattern CA_OLD_DIACRITICS = 
compile(".*\\b(sóc|dóna|dónes|vénen|véns|fóra)\\b.*",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CHARACTER_CLASS); private RuleMatch adjustCatalanMatch(RuleMatch ruleMatch, Set enabledRules) { String errorStr = ruleMatch.getOriginalErrorStr(); @@ -419,27 +419,27 @@ private String removeOldDiacritics(String s) { .replace("Fóra", "Fora"); } - private static final Pattern CA_CONTRACTIONS = compile("\\b([Aa]|[Dd]e) e(ls?)\\b"); - private static final Pattern CA_APOSTROPHES1 = compile("\\b([LDNSTMldnstm]['’]) "); + private static final Pattern CA_CONTRACTIONS = compile("\\b([Aa]|[Dd]e) e(ls?)\\b", Pattern.UNICODE_CHARACTER_CLASS); + private static final Pattern CA_APOSTROPHES1 = compile("\\b([LDNSTMldnstm]['’]) ", Pattern.UNICODE_CHARACTER_CLASS); // exceptions: l'FBI, l'statu quo - private static final Pattern CA_APOSTROPHES2 = compile("\\b([mtlsn])['’]([^1haeiouáàèéíòóúA-ZÀÈÉÍÒÓÚ“«\"])"); + private static final Pattern CA_APOSTROPHES2 = compile("\\b([mtlsn])['’]([^1haeiouáàèéíòóúA-ZÀÈÉÍÒÓÚ“«\"])", Pattern.UNICODE_CHARACTER_CLASS); // exceptions: el iogurt, la essa private static final Pattern CA_APOSTROPHES3 = compile("\\be?([mtsldn])e? 
(h?[aeiouàèéíòóú])", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern CA_APOSTROPHES4 = compile("\\b(l)a ([aeoàúèéí][^ ])", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern CA_APOSTROPHES5 = compile("\\b([mts]e) (['’])", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern CA_APOSTROPHES6 = compile("\\bs'e(ns|ls)\\b", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern CA_APOSTROPHES7 = compile("\\b(de|a)l (h?[aeoàúèéí][^ ])", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern POSSESSIUS_v = compile("\\b([mtsMTS]e)v(a|es)\\b", - Pattern.UNICODE_CASE); + Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern POSSESSIUS_V = compile("\\b([MTS]E)V(A|ES)\\b", - Pattern.UNICODE_CASE); + Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern CA_REMOVE_SPACES = compile("\\b(a|de|pe) (ls? 
)", - Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); @Override public String adaptSuggestion(String s) { diff --git a/languagetool-language-modules/ca/src/main/java/org/languagetool/rules/ca/PronomsFeblesHelper.java b/languagetool-language-modules/ca/src/main/java/org/languagetool/rules/ca/PronomsFeblesHelper.java index fdf6b7335a4b..7cc241fc3520 100644 --- a/languagetool-language-modules/ca/src/main/java/org/languagetool/rules/ca/PronomsFeblesHelper.java +++ b/languagetool-language-modules/ca/src/main/java/org/languagetool/rules/ca/PronomsFeblesHelper.java @@ -553,7 +553,7 @@ public static String convertPronounsForIntransitiveVerb(String s) { private static Pattern pronoun_wrong_apostrophation = Pattern.compile("([mts])'([^aeiouh].*)", Pattern.CASE_INSENSITIVE); private static Pattern pronoun_missing_apostrophation = Pattern.compile("(.*)\\be([stm]) (h?[aeiouh].*)", - Pattern.CASE_INSENSITIVE); + Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CHARACTER_CLASS); private static Pattern pronoun_wrong_hypphen = Pattern.compile("(.*)(-[stm])e-(h[oi])", Pattern.CASE_INSENSITIVE); From be43eed8cf12535332acc0a828c29ccffe0f0549 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sat, 26 Oct 2024 13:31:08 -0400 Subject: [PATCH 07/11] [de] fix for \b regexp with JDK>=19 --- .../de/src/main/java/org/languagetool/language/German.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/languagetool-language-modules/de/src/main/java/org/languagetool/language/German.java b/languagetool-language-modules/de/src/main/java/org/languagetool/language/German.java index a9e70b6dc6c9..287c770c37b7 100644 --- a/languagetool-language-modules/de/src/main/java/org/languagetool/language/German.java +++ b/languagetool-language-modules/de/src/main/java/org/languagetool/language/German.java @@ -55,7 +55,7 @@ */ public class German extends Language implements AutoCloseable { - private static final Pattern TYPOGRAPHY_PATTERN = 
compile("\\b([a-zA-Z]\\.)([a-zA-Z]\\.)"); + private static final Pattern TYPOGRAPHY_PATTERN = compile("\\b([a-zA-Z]\\.)([a-zA-Z]\\.)", Pattern.UNICODE_CHARACTER_CLASS); private static final Pattern AI_DE_GGEC_MISSING_PUNCT = compile("AI_DE_GGEC_MISSING_PUNCTUATION_\\d+_DASH_J(_|AE)HRIG|AI_DE_GGEC_REPLACEMENT_CONFUSION", Pattern.CASE_INSENSITIVE); From 129852d5603fb9602fbd0a8bbf32634d8fb4770d Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sun, 27 Oct 2024 16:04:20 -0400 Subject: [PATCH 08/11] use new segment parameter for \b fix --- .../org/languagetool/tokenizers/SrxTools.java | 4 +- .../org/languagetool/resource/segment.srx | 382 +++++++++--------- pom.xml | 2 +- 3 files changed, 192 insertions(+), 196 deletions(-) diff --git a/languagetool-core/src/main/java/org/languagetool/tokenizers/SrxTools.java b/languagetool-core/src/main/java/org/languagetool/tokenizers/SrxTools.java index 96d7e9c8c146..d5331052bb65 100644 --- a/languagetool-core/src/main/java/org/languagetool/tokenizers/SrxTools.java +++ b/languagetool-core/src/main/java/org/languagetool/tokenizers/SrxTools.java @@ -31,6 +31,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.regex.Pattern; /** * Tools for loading an SRX tokenizer file. 
@@ -59,7 +60,8 @@ static SrxDocument createSrxDocument(String path) { static List tokenize(String text, SrxDocument srxDocument, String code) { List segments = new ArrayList<>(); - TextIterator textIterator = new SrxTextIterator(srxDocument, code, text); + Map parserParameters = Map.of(SrxTextIterator.DEFAULT_PATTERN_FLAGS_PARAMETER, Pattern.UNICODE_CHARACTER_CLASS); + TextIterator textIterator = new SrxTextIterator(srxDocument, code, text, parserParameters); while (textIterator.hasNext()) { segments.add(textIterator.next()); } diff --git a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx index 97317656affe..13542dfe0e75 100644 --- a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx +++ b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx @@ -1459,7 +1459,7 @@ -(?U)\b\d+\.\s +\b\d+\.\s \p{Ll}|\p{Lu}{2,} @@ -1471,11 +1471,11 @@ -(?U)\b\p{L}\.\s +\b\p{L}\.\s \p{L}\.\s -(?U)\b\p{L}\. +\b\p{L}\. \p{L}\. @@ -1495,39 +1495,39 @@ [^\p{Lu}] -(?U)\b(etc|șamd)\.\s +\b(etc|șamd)\.\s [A-Z] -(?U)\b(pag|leg|art)\.\s +\b(pag|leg|art)\.\s -(?U)\b(ian|febr?|mart?|apr|iu[nl]|aug|sept?|oct|nov|dec)\.\s +\b(ian|febr?|mart?|apr|iu[nl]|aug|sept?|oct|nov|dec)\.\s [^\p{Lu}] -(?U)\bdpdv\.\s +\bdpdv\.\s -(?U)\b(etc|șamd)\.\s +\b(etc|șamd)\.\s -(?U)\b(M)\. +\b(M)\. Ap\.N\.\s -(?U)\b(M)\.Ap\. +\b(M)\.Ap\. 
N\.\s -(?U)\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s +\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s -(?U)\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s[A-Z]\.\s +\b([Dd]l|[Dd]-na|[Dd]vs|[Pp]t)\.\s[A-Z]\.\s @@ -4424,7 +4424,6 @@ \p{Lu}\p{Ll} - \b\d+\.\s @@ -4476,16 +4475,16 @@ \p{Ll} -(?U)\b[0-9]+(руб|Руб|тыс|Тыс|трлн|млн|млрд)\.\s +\b[0-9]+(руб|Руб|тыс|Тыс|трлн|млн|млрд)\.\s \b[0-9]+ -(?U)\b(бульв|г|д|доп|др|е|зам|Зам|и|им|инд|исп|Исп)\.\s +\b(бульв|г|д|доп|др|е|зам|Зам|и|им|инд|исп|Исп)\.\s -(?U)\b(англ|в|вв|га|гг|гл|гос|грн|дм|долл|е|ед)\.\s +\b(англ|в|вв|га|гг|гл|гос|грн|дм|долл|е|ед)\.\s \p{Ll} @@ -4493,7 +4492,7 @@ -(?U)\b(кг|км|коп|л|лл|м|мг|мин|мл|млн|Млн|млрд|Млрд|мм)\.\s +\b(кг|км|коп|л|лл|м|мг|мин|мл|млн|Млн|млрд|Млрд|мм)\.\s \p{Ll} @@ -4505,7 +4504,7 @@ \p{Ll} -(?U)\b(руб|Руб|тыс|Тыс|трлн)\.\s +\b(руб|Руб|тыс|Тыс|трлн)\.\s \p{Ll} @@ -4513,7 +4512,7 @@ -(?U)\b(ч|чел|шт|экз)\.\s +\b(ч|чел|шт|экз)\.\s \p{Ll} @@ -4749,7 +4748,6 @@ \p{Lu}\p{Ll} - Yahoo![\s\u00A0] @@ -4795,43 +4793,43 @@ -(?U)\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] +\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] -(?U)\b(dc|inst|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. 
Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] +\b(dc|inst|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. 
Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] -(?U)\b(s|ca)\.[\s\u00A0] -(?U)[XIV]+\b +\b(s|ca)\.[\s\u00A0] +[XIV]+\b -(?U)\b(min|m|ca)\.[\s\u00A0] -(?U)[0-9]+\b +\b(min|m|ca)\.[\s\u00A0] +[0-9]+\b -(?U)\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0] -(?U)[XIV\d]+\b +\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0] +[XIV\d]+\b -(?U)\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] [\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll} -(?U)\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} 
-(?U)\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -(?U)\b\p{Lu}{2}\.[\s\u00A0]? +\b\p{Lu}{2}\.[\s\u00A0]? \p{Lu}{2} @@ -4840,17 +4838,17 @@ -(?U)\b([Ee]tc|m[aáà]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b([Ee]tc|m[aáà]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -(?U)\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] -(?U)\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} @@ -4860,12 +4858,12 @@ -(?U)\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] +\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] \p{Ll} @@ -4874,7 +4872,7 @@ -(?U)\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] +\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] [¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}* @@ -4883,7 +4881,6 @@ »[^\u00A0\s\.:!?…] - @@ -4933,7 +4930,7 @@ -(?U)\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] +\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0] @@ -4943,76 +4940,76 @@ -(?U)\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] +\b(\.\.\.|…)[\p{Pe}»"’”][\s\u00A0] \p{Ll} -(?U)\b(s|ca)\.[\s\u00A0] -(?U)[XIV]+\b +\b(s|ca)\.[\s\u00A0] +[XIV]+\b -(?U)\b(min|m|ca)\.[\s\u00A0] -(?U)[0-9]+\b +\b(min|m|ca)\.[\s\u00A0] +[0-9]+\b -(?U)\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0] -(?U)[XIV\d]+\b +\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0] +[XIV\d]+\b -(?U)\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] 
[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll} -(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. -(?U)[A-Za-z0-9\-]+(\.|\b) +\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +[A-Za-z0-9\-]+(\.|\b) -(?U)\b[A-Za-z0-9\-]+\. -(?U)[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) +\b[A-Za-z0-9\-]+\. +[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) -(?U)\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0] +\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0] -(?U)\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] 
+\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd|subg))\.[\s\u00A0] -(?U)\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0] +\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0] -(?U)\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0] +\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0] -(?U)\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0] +\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0] -(?U)\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0] +\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0] -(?U)\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] 
+\b(Ltd|[Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|grs?|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] [\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll} -(?U)\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -(?U)\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -(?U)\b\p{Lu}{2}\.[\s\u00A0]? +\b\p{Lu}{2}\.[\s\u00A0]? \p{Lu}{2} @@ -5021,17 +5018,17 @@ -(?U)\b([Ee]tc|m[aá]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b([Ee]tc|m[aá]x|m[ií]n|aprox|long|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} -(?U)\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] -(?U)\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] +\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0] \p{Ll} @@ -5039,7 +5036,7 @@ -(?U)\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] +\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0] [¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}* @@ -5048,19 +5045,18 @@ »[^\u00A0\s\.:!?…] - -(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. -(?U)[A-Za-z0-9\-]+(\.|\b) +\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +[A-Za-z0-9\-]+(\.|\b) -(?U)\b[Se]even\. -(?U)[Oo]nes?\b +\b[Se]even\. +[Oo]nes?\b -(?U)\b[A-Za-z0-9\-]+\. -(?U)[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) +\b[A-Za-z0-9\-]+\. +[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) @@ -5110,12 +5106,12 @@ Klässler[sn]? -(?U)\bP[Hh]\. +\bP[Hh]\. D\. -(?U)\b\p{L}\. +\b\p{L}\. 
-(?U)\b\d+\.[\u00A0\s]{1,2} +\b\d+\.[\u00A0\s]{1,2} \p{Ll}|\p{Lu}{2,} @@ -5197,45 +5193,45 @@ -(?U)\b(Mrs?|No|pp|St|no|Sr|Jr|[Ss]ek|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]{1,2} +\b(Mrs?|No|pp|St|no|Sr|Jr|[Ss]ek|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]{1,2} -(?U)\b(spp?)\.[\u00A0\s]{1,2} +\b(spp?)\.[\u00A0\s]{1,2} -(?U)\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|vsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|Dez|[Jj]gdfr|[Ee]ff)\.[\u00A0\s]{1,2} +\b(betr|Geb|Stk|ggü|Mag|mtl|Flgh?|[Pp]arl|Bsp|versch|[Dd]iesbzgl|[Zz]ykl|[Dd]bzgl[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Ee]ig|[Aa]bzü?gl|\d+-tlg|tlg|[Gg]gfls|[Ff]achspr|[Ll]tda|[Ee]inschl|[Vv]mtl|[Ss]tellv|Ev|[Bb]ezgl|lit|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|wsl|vsl|Bez|Bhf|Blvd|[Bb]spw|btto|bw|Dtl|[Gg]esetzl|Dez|[Jj]gdfr|[Ee]ff)\.[\u00A0\s]{1,2} -(?U)\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|[Dd]t|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2} +\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|[Dd]t|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2} -(?U)\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]{1,2} 
+\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]{1,2} -(?U)\b[BM]\.[\u00A0\s]Sc\.[\u00A0\s] +\b[BM]\.[\u00A0\s]Sc\.[\u00A0\s] \p{Ll} -(?U)\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2} +\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2} -(?U)\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2} +\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2} -(?U)\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|[Gg]em|Pat|prov|Betr|lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mio|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]{1,2} +\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|[Gg]em|Pat|prov|Betr|lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mio|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]{1,2} -(?U)\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|[Ss]td?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]{1,2} +\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|[Ss]td?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]{1,2} @@ -5247,7 +5243,7 @@ [\-–][\u00A0\s]\d+ 
-(?U)\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|voraussichtl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2} +\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|voraussichtl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2} @@ -5269,7 +5265,6 @@ [“„] - @@ -5475,7 +5470,6 @@ \p{Lu}\p{Ll} - [\s\u00A0] @@ -5511,28 +5505,28 @@ -(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. -(?U)[A-Za-z0-9\-]+(\.|\b) +\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +[A-Za-z0-9\-]+(\.|\b) -(?U)\b[A-Za-z0-9\-]+\. -(?U)[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) +\b[A-Za-z0-9\-]+\. +[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) -(?U)\b[A-Za-z0-9\-]+\. -(?U)[A-Za-z]{2,5}(\.|\b) +\b[A-Za-z0-9\-]+\. +[A-Za-z]{2,5}(\.|\b) -(?U)\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0] +\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0] \p{Ll} -(?U)\b(etc)\.\)[\s\u00A0] +\b(etc)\.\)[\s\u00A0] -(?U)\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0] +\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0] @@ -5556,11 +5550,11 @@ -(?U)\b\p{L}\.[\s\u00A0] +\b\p{L}\.[\s\u00A0] \p{L}\.[\s\u00A0] -(?U)\b\p{L}\. +\b\p{L}\. \p{L}\. 
@@ -5588,11 +5582,11 @@ [^\p{Lu}] -(?U)\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0] +\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0] -(?U)\b\p{Lu}\.\p{Lu}\.[\s\u00A0] +\b\p{Lu}\.\p{Lu}\.[\s\u00A0] @@ -5600,11 +5594,11 @@ -(?U)\b(:?Blvd|Ave|Mts?)\.[\s\u00A0] +\b(:?Blvd|Ave|Mts?)\.[\s\u00A0] \p{Ll}+ -(?U)\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0] +\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0] \p{Ll}+ @@ -5617,39 +5611,39 @@ [A-Z]\.[A-Z]\. -(?U)[A-Z]\b +[A-Z]\b -(?U)\bL\. -(?U)A\b +\bL\. +A\b -(?U)\bU\. -(?U)[SK]\b +\bU\. +[SK]\b -(?U)\b[nN]o\.[\s\u00A0] +\b[nN]o\.[\s\u00A0] \p{N} -(?U)\bP[Hh]\.[\s\u00A0]? +\bP[Hh]\.[\s\u00A0]? D\.? -(?U)\be\.g\.[\s\u00A0] +\be\.g\.[\s\u00A0] -(?U)\bvs\.[\s\u00A0] +\bvs\.[\s\u00A0] -(?U)\b[Ee]tc\.[\s\u00A0] +\b[Ee]tc\.[\s\u00A0] [^\p{Lu}] -(?U)\b([Bb]tw|BTW)\.[\s\u00A0] +\b([Bb]tw|BTW)\.[\s\u00A0] @@ -5661,64 +5655,64 @@ 3|4|Buzz|Crozz -(?U)\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0] +\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0] -(?U)\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0] +\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0] -(?U)\bLL\.[\s\u00A0]?[BMD]\.[\s\u00A0] +\bLL\.[\s\u00A0]?[BMD]\.[\s\u00A0] -(?U)\b[BM]\.[\s\u00A0]? +\b[BM]\.[\s\u00A0]? Eng\.? -(?U)\bLL\.[\s\u00A0]? +\bLL\.[\s\u00A0]? [BMD]\.? -(?U)\b[BM]\.[\s\u00A0]? +\b[BM]\.[\s\u00A0]? Sc\.? -(?U)\b[BM]\.[\s\u00A0]? +\b[BM]\.[\s\u00A0]? Comp?\.? -(?U)\b[BM]\.[\s\u00A0]? +\b[BM]\.[\s\u00A0]? Arch\.? 
-(?U)\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0] +\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0] -(?U)\bI(nc|NC)\.[\s\u00A0] +\bI(nc|NC)\.[\s\u00A0] -(?U)\bCorp\.[\s\u00A0] +\bCorp\.[\s\u00A0] -(?U)\bBros\.[\s\u00A0] +\bBros\.[\s\u00A0] -(?U)\bLtd\.[\s\u00A0] +\bLtd\.[\s\u00A0] \p{Ll}+ -(?U)\bCo\.[\s\u00A0] +\bCo\.[\s\u00A0] -(?U)\bE\.[\s\u00A0] -(?U)\b[Cc]oli\b +\bE\.[\s\u00A0] +\b[Cc]oli\b @@ -5737,7 +5731,7 @@ -(?U)\b[0-9]+(\.|:)[0-9][0-9][\s\u00A0\u202F] +\b[0-9]+(\.|:)[0-9][0-9][\s\u00A0\u202F] @@ -5750,7 +5744,7 @@ -(?U)\b(в|у|на|за|з|із|зі|зо)(\.\.\.|…)[\h\v]* +\b(в|у|на|за|з|із|зі|зо)(\.\.\.|…)[\h\v]* \p{Lu} @@ -5763,12 +5757,12 @@ -(?U)\b\d{1,3}\.[\h]+ +\b\d{1,3}\.[\h]+ \p{Ll}|\p{Lu}{2,} -(?U)\b\p{Ll}+[.!?][\h\v]* +\b\p{Ll}+[.!?][\h\v]* \h*(([\(«]|[\[‐-―-][\h\v]*)?\p{Ll}) @@ -5777,17 +5771,17 @@ -(?U)\b\p{L}{1,2}\. +\b\p{L}{1,2}\. \p{L}{1,2}\. -(?U)\b[\u00A0\u202F]?[A-Z]\.[\h\v]? +\b[\u00A0\u202F]?[A-Z]\.[\h\v]? [A-Z][a-zA-Z'’.-]|[А-ЯІЇЄҐ]\. -(?U)(^[\h\v]*|\([\h\v]*|[«„"]|(\b[А-ЯІЇЄҐACEIHOPX]\.-))[А-ЯІЇЄҐA-Z]\.[\h\v]* +(^[\h\v]*|\([\h\v]*|[«„"]|(\b[А-ЯІЇЄҐACEIHOPX]\.-))[А-ЯІЇЄҐA-Z]\.[\h\v]* @@ -5811,12 +5805,12 @@ а до лютого 2020 р. 
— затвердити --> -(?U)\b([0-9]{2}|[0-9]{4})[\h\v]+р\.[\h\v]+ +\b([0-9]{2}|[0-9]{4})[\h\v]+р\.[\h\v]+ [\h\v]*[№0-9‐-―-] -(?U)(?<!\d[\h]*)\bр\.[\h\v]* +(?<!\d[\h]*)\bр\.[\h\v]* [\h]*(?!(На|Але|Так?)[\h\v]+)[А-ЯІЇЄҐA-Z][^\h] @@ -5831,29 +5825,29 @@ -(?U)\b([0-9]0|[0-9]{3}0)(-[мх])?рр\.[\h\v]* +\b([0-9]0|[0-9]{3}0)(-[мх])?рр\.[\h\v]* -(?U)\b(тис|млн|млрд|грн)\.[\h\v]* +\b(тис|млн|млрд|грн)\.[\h\v]* [\h\v]*(\d|[КМ]Вт) -(?U)\b(укр|рос|англ?|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк)?)\.[\h\v]* +\b(укр|рос|англ?|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк)?)\.[\h\v]* -(?U)\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]* +\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]* -(?U)\b(кін)\.[\h\v]* +\b(кін)\.[\h\v]* [а-яіїєґ0-9IXV]|[ІХ]+\b -(?U)\b[сС]т\.[\h\v] +\b[сС]т\.[\h\v] [\h]*(?!([АВУОІЄ]|На|Але|Так?)[\h\v]) @@ -5862,21 +5856,21 @@ -(?U)\bнар\.[\h\v]* +\bнар\.[\h\v]* ([0-9]|бл\.|арт\.) -(?U)\bдол\.[\h\v]* +\bдол\.[\h\v]* США -(?U)(?<!т\.[\h\v]?)\b[пд]\.[\h\v]* +(?<!т\.[\h\v]?)\b[пд]\.[\h\v]* -(?U)\b(див)\.[\h\v] +\b(див)\.[\h\v] [\h\v]*[^А-ЯІЇЄҐ] @@ -5888,20 +5882,20 @@ України (див. 
Зимові походи) --> -(?U)(\([^)]*|\[[^\]]*|,[\h\v]*)\b(див)\.[\h\v]* +(\([^)]*|\[[^\]]*|,[\h\v]*)\b(див)\.[\h\v]* -(?U)\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп|[сС]вт)\.[\h\v]* +\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп|[сС]вт)\.[\h\v]* [\h\v]*[А-ЯІЇЄҐA-Z] -(?U)(?<![іи]\s+)\bдр\.[\h\v]* +(?<![іи]\s+)\bдр\.[\h\v]* [\h\v]*[А-ЯІЇЄҐ] -(?U)\bМан\.[\h\v]* +\bМан\.[\h\v]* [\h\v]*([Сс]іті|[Юю]н) @@ -5911,7 +5905,7 @@ -(?U)\b([Аа]рт|[Мм]ал|[Рр]ис|[Сс]пр)\.[\h\v]* +\b([Аа]рт|[Мм]ал|[Рр]ис|[Сс]пр)\.[\h\v]* [\h\v]*(№[\h\v]*)?[0-9] @@ -5921,7 +5915,7 @@ -(?U)(?<!\d[\h\v]*)\bм\.[\h\v]* +(?<!\d[\h\v]*)\bм\.[\h\v]* [А-ЯІЇЄҐ][а-яіїєґ'] @@ -5945,7 +5939,7 @@ [\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)\.[\h\v]*[\)\]] -(?U)\b([Рр]ед)\.[\h\v]* +\b([Рр]ед)\.[\h\v]* [А-ЯІЇЄҐ] @@ -6241,129 +6235,129 @@ -(?U)\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. +\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\. [A-Za-z0-9\-]+(\.|\b) -(?U)\b[A-Za-z0-9\-]+\. +\b[A-Za-z0-9\-]+\. [A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b) -(?U)\b(a|Ab|abr|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|ago|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s? +\b(a|Ab|abr|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|ago|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s? -(?U)\b(Bot|barb|B\.el|Bibl|Biol|Bioquím|burl)\.\s? +\b(Bot|barb|B\.el|Bibl|Biol|Bioquím|burl)\.\s? -(?U)\b(ca|card|cat|caus|cf|cit|cód|comp|compar|conj|contr|coord|cop)\.\s? +\b(ca|card|cat|caus|cf|cit|cód|comp|compar|conj|contr|coord|cop)\.\s? -(?U)\b(D|def|dem|deprec|deriv|det|dez|disj|[Dd]ra?s?)\.\s? +\b(D|def|dem|deprec|deriv|det|dez|disj|[Dd]ra?s?)\.\s? -(?U)\b(Ecol|Econ|ed|elem|Eng|erud|estrang|ex|Ex)\.\s? +\b(Ecol|Econ|ed|elem|Eng|erud|estrang|ex|Ex)\.\s? -(?U)\b(etc)\.\s? 
+\b(etc)\.\s? \p{Ll} -(?U)\b(f|fam|Farm|fem|fev|fig|fin|fl|fr|frac)\.\s? +\b(f|fam|Farm|fem|fev|fig|fin|fl|fr|frac)\.\s? -(?U)\b(gén|geog|Geogr|Geol|Geom|gír|gloss|Gram)\.\s? +\b(gén|geog|Geogr|Geol|Geom|gír|gloss|Gram)\.\s? -(?U)\b(hab|hist|Hort)\.\s? +\b(hab|hist|Hort)\.\s? -(?U)\b(Ibid|id|i.e|incompat|indef|inf|infant|Inform|integr|interj|interr|intr|inv)\.\s? +\b(Ibid|id|i.e|incompat|indef|inf|infant|Inform|integr|interj|interr|intr|inv)\.\s? -(?U)\b(jan|jul|jun|Jorn|Jur)\.\s? +\b(jan|jul|jun|Jorn|Jur)\.\s? -(?U)\b(lat|Lat|Lda|Ling|Lit|liv|loc|log|Lóg|long)\.\s? +\b(lat|Lat|Lda|Ling|Lit|liv|loc|log|Lóg|long)\.\s? -(?U)\b(m|mai|mar|masc|Mat|máx|Mecân|[Mm]ed|Mil|mín|mult|Mús)\.\s? +\b(m|mai|mar|masc|Mat|máx|Mecân|[Mm]ed|Mil|mín|mult|Mús)\.\s? -(?U)\b(n|N|Náut|N.B|neg|neol|nov|num|núm)\.\s? +\b(n|N|Náut|N.B|neg|neol|nov|num|núm)\.\s? -(?U)\b(ord|out)\.\s? +\b(ord|out)\.\s? -(?U)\b(pág|págs|Paleont|part|pass|[Pp]edag|pejor|pess|Pesc|p|Pe|p.f|pl|pleb|p.m|poét|[Pp]olít|pop|pov|poss|p.p|p.p.m|pp|pref|prep|[Pp]rof|pron|P.S)\.\s? +\b(pág|págs|Paleont|part|pass|[Pp]edag|pejor|pess|Pesc|p|Pe|p.f|pl|pleb|p.m|poét|[Pp]olít|pop|pov|poss|p.p|p.p.m|pp|pref|prep|[Pp]rof|pron|P.S)\.\s? -(?U)\b(q.b|q.do|Q.E|Q.I|ql)\.\s? +\b(q.b|q.do|Q.E|Q.I|ql)\.\s? -(?U)\b(R|rel|Relig|Rev)\.\s? +\b(R|rel|Relig|Rev)\.\s? -(?U)\b(S|S.A|set|símb|S. ?M|[Ss]ra?s?|[Ss]rta|suf|superl)\.\s? +\b(S|S.A|set|símb|S. ?M|[Ss]ra?s?|[Ss]rta|suf|superl)\.\s? -(?U)\b(t|tip|Tip|tít|top|[Tt]opogr|tr|trad|Trás-os-M|trim)\.\s? +\b(t|tip|Tip|tít|top|[Tt]opogr|tr|trad|Trás-os-M|trim)\.\s? -(?U)\b(Univ)\.\s? +\b(Univ)\.\s? -(?U)\b(v|V|vd|vid|voc|vol|V.S|vs|vulg)\.\s? +\b(v|V|vd|vid|voc|vol|V.S|vs|vulg)\.\s? -(?U)\b(Zool)\.\s? +\b(Zool)\.\s? -(?U)\bs([eé]c)?\.\s? +\bs([eé]c)?\.\s? [IVXDMCL]+ -(?U)\b(Mr|Mrs|No|pp|St|Jr|Bros|etc|vs|esp|[Ff]ig|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co|Ltd)\.\s? +\b(Mr|Mrs|No|pp|St|Jr|Bros|etc|vs|esp|[Ff]ig|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co|Ltd)\.\s? -(?U)\b(sp|spp)\.\s? +\b(sp|spp)\.\s? 
-(?U)\b[A-ZÀÉÈÍÓÒÚ]\.\s? +\b[A-ZÀÉÈÍÓÒÚ]\.\s? -(?U)\b[ad]\.\s? +\b[ad]\.\s? C\. @@ -6379,7 +6373,7 @@ S\. -(?U)\bP[Hh]\.\s? +\bP[Hh]\.\s? D\.? @@ -6398,27 +6392,27 @@ -(?U)\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s +\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s \p{Ll} -(?U)\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s +\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"”']*\s \p{Ll} -(?U)\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s +\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s \p{Ll} -(?U)\bet al\.[\p{Pe}\p{Pf}\p{Pd}"”']*\s +\bet al\.[\p{Pe}\p{Pf}\p{Pd}"”']*\s -(?U)\b([Ee]sc|K[gm]s?|[mc]?[gml]s]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s +\b([Ee]sc|K[gm]s?|[mc]?[gml]s]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"”'’]*\s \p{Ll} @@ -6469,7 +6463,7 @@ -(?U)\b(etc)\.\s? +\b(etc)\.\s? \p{Lu}\p{Ll}* diff --git a/pom.xml b/pom.xml index ae7bdc0d64db..213d0dba67aa 100644 --- a/pom.xml +++ b/pom.xml @@ -191,7 +191,7 @@ 1.0 2.3.1 - 2.0.3 + 2.0.4 4.4 1.12.0 From bd299bd7291b3594b20c08d8d9419c18f6779ab8 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Mon, 28 Oct 2024 22:30:54 -0400 Subject: [PATCH 09/11] [de] fix for \b regexp with JDK>=19 --- .../rules/AbstractUnitConversionRule.java | 4 ++-- .../org/languagetool/rules/de/grammar.xml | 22 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractUnitConversionRule.java b/languagetool-core/src/main/java/org/languagetool/rules/AbstractUnitConversionRule.java index c69679c5b158..1fb4d08e284c 100644 --- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractUnitConversionRule.java +++ b/languagetool-core/src/main/java/org/languagetool/rules/AbstractUnitConversionRule.java @@ -80,7 +80,7 @@ public abstract class AbstractUnitConversionRule extends Rule { protected static final String NUMBER_REGEX = "(-?[0-9]{1,32}[0-9,.]{0,32})"; protected static final String NUMBER_REGEX_WITH_BOUNDARY = "(-?\\b[0-9]{1,32}[0-9,.]{0,32})"; - 
protected final Pattern numberRangePart = Pattern.compile(NUMBER_REGEX_WITH_BOUNDARY + "$"); + protected final Pattern numberRangePart = Pattern.compile(NUMBER_REGEX_WITH_BOUNDARY + "$", Pattern.UNICODE_CHARACTER_CLASS); private static final double DELTA = 1e-2; private static final double ROUNDING_DELTA = 0.05; @@ -196,7 +196,7 @@ protected String formatRounded(String s) { */ protected void addUnit(String pattern, Unit base, String symbol, double factor, boolean metric) { Unit unit = base.multiply(factor); - unitPatterns.put(Pattern.compile(NUMBER_REGEX_WITH_BOUNDARY + "[\\s\u00A0]{0," + WHITESPACE_LIMIT + "}" + pattern + "\\b"), unit); + unitPatterns.put(Pattern.compile(NUMBER_REGEX_WITH_BOUNDARY + "[\\s\u00A0]{0," + WHITESPACE_LIMIT + "}" + pattern + "\\b", Pattern.UNICODE_CHARACTER_CLASS), unit); unitSymbols.putIfAbsent(unit, new ArrayList<>()); unitSymbols.get(unit).add(symbol); if (metric && !metricUnits.contains(unit)) { diff --git a/languagetool-language-modules/de/src/main/resources/org/languagetool/rules/de/grammar.xml b/languagetool-language-modules/de/src/main/resources/org/languagetool/rules/de/grammar.xml index 9c7d99f1f4b3..154046592e59 100644 --- a/languagetool-language-modules/de/src/main/resources/org/languagetool/rules/de/grammar.xml +++ b/languagetool-language-modules/de/src/main/resources/org/languagetool/rules/de/grammar.xml @@ -79406,7 +79406,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Dr.phil. Hans Müller - (s|S)\.\s?(o|u)\. + (s|S)\.[ \t]?(o|u)\. &glalong; \1. \2. @@ -79415,7 +79415,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Das ist auch nicht gut, s. o. - (d|D)\.\s?h\. + (d|D)\.[ \t]?h\. &glalong; \1. h. @@ -79423,7 +79423,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Das ist falsch, d.h. schlecht. - (m|M)\.\s?E\. + (m|M)\.[ \t]?E\. &glalong; \1. E. 
@@ -79439,7 +79439,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Hans Müller, M.A. - (z|Z)\.\s?(B|K|T|Zt|Hd?)\. + (z|Z)\.[ \t]?(B|K|T|Zt|Hd?)\. &glalong; \1. \2. @@ -79449,7 +79449,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Das ist z. B. auch nicht gut. - (v|n)\.\s?Chr\. + (v|n)\.[ \t]?Chr\. &glalong; \1. Chr. @@ -79465,7 +79465,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Das Seminar beginnt um 16 Uhr c.t. - (u|o|i)\.\s?(ö|ä|a|dgl)\. + (u|o|i)\.[ \t]?(ö|ä|a|dgl)\. &glalong; \1. \2. @@ -79476,7 +79476,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Das hier u. dgl. ist auch nicht gut. - (e|i|n)\.\s?V\. + (e|i|n)\.[ \t]?V\. &glalong; \1. V. @@ -79485,7 +79485,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Sprechstunde n.V. - (u)\.\s?(v)\.\s?(m|a)\. + (u)\.\s?(v)\.[ \t]?(m|a)\. &glalong; \1. \2. \3. https://languagetool.org/insights/de/beitrag/vielmehr-viel-mehr/ @@ -79520,7 +79520,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Vgl. Radebrecher, a.a.O. - \bp\.\s?a\. + \bp\.[ \t]?a\. &glalong; p. a. https://de.wiktionary.org/wiki/p._a. @@ -79528,7 +79528,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Hier gibt es eine Rendite von 2 % p.a. - Dr\.\s?med\.\s?(dent|vet)\. + Dr\.\s?med\.[ \t]?(dent|vet)\. &glalong; Dr. med. \1. @@ -79538,7 +79538,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA Dr. med.vet. Hans Müller - Dr\.\s?rer\.\s?(nat|pol)\. + Dr\.\s?rer\.[ \t]?(nat|pol)\. &glalong; Dr. rer. \1. 
From 47c98059bf8ae7bddbe3ada4a6d71d1961f1932c Mon Sep 17 00:00:00 2001 From: Stefan Viol Date: Wed, 30 Oct 2024 11:41:26 +0100 Subject: [PATCH 10/11] update commandline tests --- .../commandline/AbstractSecurityTestCase.java | 76 ------------------- .../languagetool/commandline/MainTest.java | 64 ++++++++-------- 2 files changed, 30 insertions(+), 110 deletions(-) delete mode 100644 languagetool-commandline/src/test/java/org/languagetool/commandline/AbstractSecurityTestCase.java diff --git a/languagetool-commandline/src/test/java/org/languagetool/commandline/AbstractSecurityTestCase.java b/languagetool-commandline/src/test/java/org/languagetool/commandline/AbstractSecurityTestCase.java deleted file mode 100644 index 11e0ed1c7888..000000000000 --- a/languagetool-commandline/src/test/java/org/languagetool/commandline/AbstractSecurityTestCase.java +++ /dev/null @@ -1,76 +0,0 @@ -/* LanguageTool, a natural language style checker - * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 - * USA - */ -package org.languagetool.commandline; - -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import java.security.Permission; - -/** - * @author Charlie Collins (Maven Test Example from - * http://www.screaming-penguin.com/node/7570) - */ -public class AbstractSecurityTestCase { - - protected static class ExitException extends SecurityException { - private static final long serialVersionUID = 1L; - public final int status; - public ExitException(int status) { - super("There is no escape!"); - this.status = status; - } - } - - private static class NoExitSecurityManager extends SecurityManager { - @Override - public void checkPermission(@SuppressWarnings("unused") Permission perm) { - // allow anything. - } - - @Override - @SuppressWarnings("unused") - public void checkPermission(Permission perm, Object context) { - // allow anything. 
- } - - @Override - public void checkExit(int status) { - super.checkExit(status); - throw new ExitException(status); - } - } - - @Before - public void setUp() throws Exception { - System.setSecurityManager(new NoExitSecurityManager()); - } - - @After - public void tearDown() throws Exception { - System.setSecurityManager(null); - } - - //get rid of JUnit warning for this helper class - @Test - public void testSomething() { - } - -} diff --git a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java index f9e6c3f87917..24b2e4a0b966 100644 --- a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java +++ b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java @@ -23,15 +23,7 @@ import org.junit.Before; import org.junit.Test; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.PrintStream; -import java.io.PrintWriter; +import java.io.*; import java.nio.charset.StandardCharsets; import static org.hamcrest.CoreMatchers.is; @@ -42,7 +34,7 @@ * * @author Marcin Miłkowski */ -public class MainTest extends AbstractSecurityTestCase { +public class MainTest { private final File enTestFile; private final File xxRuleFile; @@ -113,8 +105,7 @@ public MainTest() throws IOException { } @Before - public void setUp() throws Exception { - super.setUp(); + public void setUp() { this.stdout = System.out; this.stderr = System.err; this.out = new ByteArrayOutputStream(); @@ -124,38 +115,33 @@ public void setUp() throws Exception { } @After - public void tearDown() throws Exception { + public void tearDown() { System.setOut(this.stdout); System.setErr(this.stderr); - super.tearDown(); } @Test public void testUsageMessage() throws 
Exception { - try { - String[] args = {"-h"}; - Main.main(args); - fail("LT should have exited with status 0!"); - } catch (ExitException e) { - String output = new String(this.out.toByteArray()); - assertTrue(output.contains("Usage: java -jar languagetool-commandline.jar")); - assertEquals("Exit status", 1, e.status); - } + Process process = new ProcessBuilder( + "java", "-cp", System.getProperty("java.class.path"), "org.languagetool.commandline.Main", "-h" + ).start(); + int exitCode = process.waitFor(); + String output = readProcessOutput(process); + assertTrue(output.contains("Usage: java -jar languagetool-commandline.jar")); + assertEquals("Exit status", 1, exitCode); } @Test public void testPrintLanguages() throws Exception { - try { - String[] args = {"--list"}; - Main.main(args); - fail("LT should have exited with status 0!"); - } catch (ExitException e) { - String output = new String(this.out.toByteArray()); - assertTrue(output.contains("German")); - assertTrue(output.contains("de-DE")); - assertTrue(output.contains("English")); - assertEquals("Exit status", 0, e.status); - } + Process process = new ProcessBuilder( + "java", "-cp", System.getProperty("java.class.path"), "org.languagetool.commandline.Main", "--list" + ).start(); + int exitCode = process.waitFor(); + String output = readProcessOutput(process); + assertTrue(output.contains("German")); + assertTrue(output.contains("de-DE")); + assertTrue(output.contains("English")); + assertEquals("Exit status", 0, exitCode); } @Test @@ -670,4 +656,14 @@ private String getExternalFalseFriends() { return xxFalseFriendFile.getAbsolutePath(); } + private String readProcessOutput(Process process) throws IOException { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) { + StringBuilder output = new StringBuilder(); + String line; + while ((line = reader.readLine()) != null) { + output.append(line).append(System.lineSeparator()); + } + return output.toString(); + } + 
} } From 3d9433830df7bdf961cff2a4b2dc1558c7d8d176 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Tue, 5 Nov 2024 08:22:52 -0500 Subject: [PATCH 11/11] [uk] fix for a typo in regex --- .../src/main/resources/org/languagetool/resource/segment.srx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx index 13542dfe0e75..7800a19810e4 100644 --- a/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx +++ b/languagetool-core/src/main/resources/org/languagetool/resource/segment.srx @@ -5839,7 +5839,7 @@ -\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]* +\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]ел|ч|част)\.[\h\v]*