forked from GeoKnow/FAGI-gis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpecificationConstants.java
777 lines (626 loc) · 22.9 KB
/
SpecificationConstants.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
package gr.athena.innovation.fagi.specification;
import java.text.Collator;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
/**
 * Central holder for the constants used across the application.
 *
 * <p>Constants are grouped by concern into nested static classes ({@link Config}, {@link Rule},
 * {@link Mapping}, {@link Properties}, {@link Similarity}, {@link Evaluation}, {@link Stats},
 * {@link Regex}, {@link Functions}, {@link Normalize}); a few general-purpose constants live
 * directly on this class. Neither this class nor the nested holders are meant to be instantiated,
 * which is enforced with private constructors.
 *
 * @author nkarag
 */
public class SpecificationConstants {

    /**
     * Connector/separator character for building strings in FAGI (a single space).
     */
    public static final char CONNECTOR = ' ';

    /**
     * Buffer size for fusion log output.
     */
    public static final int FUSION_LOG_BUFFER_SIZE = 1000;

    /**
     * File permissions, in POSIX {@code rwxrwxrwx} string notation, applied when copying input
     * files that come with arbitrary permissions.
     */
    public static final String POSIX_FILE_PERMISSIONS_STRING = "rw-r--r--";

    /**
     * Command-line help message.
     */
    public static final String HELP = "Usage:\n java -jar fagi.jar -spec <configFile>\n"
            + "-spec requires the config.xml file path\n";

    /**
     * Constant for EPSG:4326 coordinate reference system.
     */
    public static final String CRS_EPSG_4326 = "EPSG:4326";

    /**
     * Constant for EPSG:3857 coordinate reference system.
     */
    public static final String CRS_EPSG_3857 = "EPSG:3857"; //google all

    /**
     * Constant for EPSG:4508 coordinate reference system.
     */
    public static final String CRS_EPSG_4508 = "EPSG:4508"; //thai

    /**
     * Constant for EPSG:2100 coordinate reference system.
     */
    public static final String CRS_EPSG_2100 = "EPSG:2100"; //greece

    /**
     * Constant for EPSG:5243 coordinate reference system.
     */
    public static final String CRS_EPSG_5243 = "EPSG:5243"; //germany

    /**
     * Date patterns treated as "known date formats" in FAGI.
     *
     * <p>The array itself is mutable (Java arrays cannot be made read-only); callers must not
     * modify it.
     *
     * <p>NOTE(review): some entries use lowercase {@code mm} in the date part (for example
     * {@code dd-mm-yyyy}). In the pattern syntax of {@code java.text.SimpleDateFormat} and
     * {@code java.time.format.DateTimeFormatter}, lowercase {@code m} is minute-of-hour and
     * uppercase {@code M} is month. The entries are kept unchanged because existing callers may
     * depend on the current matching behavior; confirm whether they are intentional. Also note
     * that {@code dd/MM/yyyy} is listed in {@link #PRIMARY_DATE_FORMATS} but not here.
     */
    public static final String[] DATE_FORMATS = {
        "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ssZ",
        "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'",
        "yyyy-MM-dd'T'HH:mm:ss.SSSZ", "yyyy-MM-dd HH:mm:ss",
        "MM/dd/yyyy HH:mm:ss", "MM/dd/yyyy'T'HH:mm:ss.SSS'Z'",
        "MM/dd/yyyy'T'HH:mm:ss.SSSZ", "MM/dd/yyyy'T'HH:mm:ss.SSS",
        "MM/dd/yyyy'T'HH:mm:ssZ", "MM/dd/yyyy'T'HH:mm:ss",
        "yyyy:MM:dd HH:mm:ss", "yyyy-MM-dd",
        "dd mm yyyy", "yyyy/MM/dd",
        "dd-mm-yyyy", "dd-MM-yyyy",
        "dd/mm/yyyy", "dd MM yyyy"};

    /**
     * Date patterns considered "primary" in FAGI.
     *
     * <p>The array itself is mutable; callers must not modify it.
     *
     * <p>NOTE(review): the {@code dd-mm-yyyy} and {@code dd/mm/yyyy} entries use lowercase
     * {@code mm}, which denotes minutes in the standard Java date pattern syntax; kept unchanged,
     * confirm whether they are intentional.
     */
    public static final String[] PRIMARY_DATE_FORMATS = {
        "yyyy-MM-dd", "yyyy/MM/dd",
        "dd-mm-yyyy", "dd-MM-yyyy",
        "dd/mm/yyyy", "dd/MM/yyyy"};

    /** Not instantiable: this class only holds constants. */
    private SpecificationConstants() {
    }

    /**
     * Configuration class groups anything that has to do with the XML configuration constants.
     */
    public static class Config {

        /**
         * Filename for the rules XML specification.
         */
        public static final String RULES_XML = "rules.xml";

        /**
         * Filename for the rules XSD that describes the rules XML file.
         */
        public static final String RULES_XSD = "rules.xsd";

        /**
         * Filename for the configuration XML file.
         */
        public static final String CONFIG_XML = "config.xml";

        /**
         * Filename for the configuration XSD file that describes the configuration XML file.
         */
        public static final String CONFIG_XSD = "config.xsd";

        /**
         * Name for input format tag in XML.
         */
        public static final String INPUT_FORMAT = "inputFormat";

        /**
         * Name for output format tag in XML.
         */
        public static final String OUTPUT_FORMAT = "outputFormat";

        /**
         * Name for locale tag in XML.
         */
        public static final String LOCALE = "locale";

        /**
         * Name for similarity tag in XML.
         */
        public static final String SIMILARITY = "similarity";

        /**
         * Name for fusion log tag in XML.
         */
        public static final String FUSION_LOG = "fusionLog";

        /**
         * Name for stats mode in XML.
         */
        public static final String STATS = "stats";

        /**
         * Name for light-stats mode in XML.
         */
        public static final String LIGHT_STATS = "light";

        /**
         * Name for detailed-stats mode in XML.
         */
        public static final String DETAILED_STATS = "detailed";

        /**
         * Name for rules (file-path) tag in XML.
         */
        public static final String RULES = "rules";

        /**
         * Name for left dataset tag in XML.
         */
        public static final String LEFT_DATASET = "left";

        /**
         * Name for right dataset tag in XML.
         */
        public static final String RIGHT_DATASET = "right";

        /**
         * Name for target tag in XML.
         */
        public static final String TARGET = "target";

        /**
         * Name for mode tag in XML.
         */
        public static final String MODE = "mode";

        /**
         * Name for links tag in XML.
         */
        public static final String LINKS = "links";

        /**
         * Name for links format tag in XML.
         */
        public static final String LINKS_FORMAT = "linksFormat";

        /**
         * NT value for links format tag in XML.
         */
        public static final String NT = "nt";

        /**
         * CSV value for links format tag in XML.
         */
        public static final String CSV = "csv";

        /**
         * CSV-unique-links value for links format tag in XML.
         */
        public static final String CSV_UNIQUE_LINKS = "csv-unique-links";

        /**
         * CSV-ensembles value for links input in XML.
         */
        public static final String CSV_ENSEMBLES = "csv-ensembles";

        /**
         * Name for categories tag in XML.
         */
        public static final String CATEGORIES = "categories";

        /**
         * Name for date tag in XML.
         */
        public static final String DATE = "date";

        /**
         * Date pattern ({@code yyyy-MM-dd}). Unlike its neighbours this is a format pattern,
         * not an XML tag name.
         */
        public static final String DATE_FORMAT = "yyyy-MM-dd";

        /**
         * Name for id tag in XML.
         */
        public static final String ID = "id";

        /**
         * Name for file tag in XML.
         */
        public static final String FILE = "file";

        /**
         * Name for endpoint tag in XML.
         */
        public static final String ENDPOINT = "endpoint";

        /**
         * Name for output directory tag in XML.
         */
        public static final String OUTPUT_DIR = "outputDir";

        /**
         * Name for fused tag in XML.
         */
        public static final String FUSED = "fused";

        /**
         * Name for remaining tag in XML.
         */
        public static final String REMAINING = "remaining";

        /**
         * Name for ambiguous tag in XML.
         */
        public static final String AMBIGUOUS = "ambiguous";

        /**
         * Name for statistics tag in XML.
         */
        public static final String STATISTICS = "statistics";

        /**
         * Name for verbose tag in XML.
         */
        public static final String VERBOSE = "verbose";

        /**
         * Default filename for the fused output.
         */
        public static final String DEFAULT_FUSED_FILENAME = "fused.nt";

        /**
         * Default filename for the remaining output.
         */
        public static final String DEFAULT_REMAINING_FILENAME = "remaining.nt";

        /**
         * Default filename for the ambiguous output.
         */
        public static final String DEFAULT_AMBIGUOUS_FILENAME = "ambiguous.nt";

        /**
         * Default filename for the statistics output.
         */
        public static final String DEFAULT_STATS_FILENAME = "stats.json";

        /**
         * Default filename for the fusion log.
         */
        public static final String DEFAULT_FUSION_LOG_FILENAME = "fusionLog.txt";

        /**
         * Suffix for naming frequency-related files for dataset A.
         */
        public static final String FREQ_SUFFIX_A = ".a.freq.txt";

        /**
         * Suffix for naming frequency-related files for dataset B.
         */
        public static final String FREQ_SUFFIX_B = ".b.freq.txt";

        /**
         * Filename for extracted features CSV file.
         */
        public static final String FEATURES_CSV = "features_export.csv";

        /**
         * ML group tag in configuration XML.
         */
        public static final String ML = "ML";

        /**
         * Key for the file path of the ML validation model (presumably a tag inside the
         * {@link #ML} group; confirm against the configuration XSD).
         */
        public static final String VALIDATION = "validation";

        /**
         * Key for the file path of the ML name property model (presumably a tag inside the
         * {@link #ML} group; confirm against the configuration XSD).
         */
        public static final String NAME = "name";

        /**
         * Key for the file path of the ML address property model (presumably a tag inside the
         * {@link #ML} group; confirm against the configuration XSD).
         */
        public static final String ADDRESS = "address";

        /**
         * Key for the file path of the ML website property model (presumably a tag inside the
         * {@link #ML} group; confirm against the configuration XSD).
         */
        public static final String WEBSITE = "website";

        /**
         * Key for the file path of the ML phone property model (presumably a tag inside the
         * {@link #ML} group; confirm against the configuration XSD).
         */
        public static final String PHONE = "phone";

        /**
         * Key for the file path of the ML email property model (presumably a tag inside the
         * {@link #ML} group; confirm against the configuration XSD).
         */
        public static final String EMAIL = "email";

        /** Not instantiable: constants holder. */
        private Config() {
        }
    }

    /**
     * Rule class groups anything that has to do with the XML rule specification constants.
     */
    public static class Rule {

        /**
         * Name for rule tag in XML.
         */
        public static final String RULE = "rule";

        /**
         * Name for validation rule tag in XML.
         */
        public static final String VALIDATION_RULE = "validationRule";

        /**
         * Name for default dataset action tag in XML.
         */
        public static final String DEFAULT_DATASET_ACTION = "defaultDatasetAction";

        /**
         * Name for default action tag in XML.
         */
        public static final String DEFAULT_ACTION = "defaultAction";

        /**
         * Name for property of dataset A tag in XML.
         */
        public static final String PROPERTY_A = "propertyA";

        /**
         * Name for property of dataset B tag in XML.
         */
        public static final String PROPERTY_B = "propertyB";

        /**
         * Name for external property tag in XML.
         */
        public static final String EXTERNAL_PROPERTY = "externalProperty";

        /**
         * Name for condition tag in XML.
         */
        public static final String CONDITION = "condition";

        /**
         * Name for action rule set tag in XML.
         */
        public static final String ACTION_RULE_SET = "actionRuleSet";

        /**
         * Name for action rule tag in XML.
         */
        public static final String ACTION_RULE = "actionRule";

        /**
         * Name for action tag in XML.
         */
        public static final String ACTION = "action";

        /**
         * Name for expression tag in XML.
         */
        public static final String EXPRESSION = "expression";

        /**
         * Name for function tag in XML.
         */
        public static final String FUNCTION = "function";

        /**
         * Name for AND operation tag in XML.
         */
        public static final String AND = "and";

        /**
         * Name for OR operation tag in XML.
         */
        public static final String OR = "or";

        /**
         * Name for NOT operation tag in XML.
         */
        public static final String NOT = "not";

        /**
         * Name indicating parameter from dataset a (left).
         */
        public static final String A = "a";

        /**
         * Name indicating parameter from dataset b (right).
         */
        public static final String B = "b";

        /**
         * Concatenation separator (comma followed by a space).
         */
        public static final String CONCATENATION_SEP = ", ";

        /**
         * Functional properties tag in XML.
         */
        public static final String FUNCTIONAL_PROPERTIES = "functionalProperties";

        /**
         * Non-functional properties tag in XML.
         */
        public static final String NON_FUNCTIONAL_PROPERTIES = "nonFunctionalProperties";

        /** Not instantiable: constants holder. */
        private Rule() {
        }
    }

    /**
     * Property mapping for constructing nodes.
     */
    public static class Mapping {

        /**
         * Unmodifiable map from property URI to the short name used for it
         * (for example {@code http://slipo.eu/def#openingHours} maps to {@code timeSlot}).
         */
        public static final Map<String, String> PROPERTY_MAPPINGS;

        static {
            Map<String, String> map = new HashMap<>();
            map.put("http://slipo.eu/def#openingHours", "timeSlot");
            map.put("http://slipo.eu/def#fax", "fax");
            map.put("http://slipo.eu/def#address", "address");
            map.put("http://slipo.eu/def#phone", "phone");
            map.put("http://slipo.eu/def#email", "email");
            map.put("http://www.opengis.net/ont/geosparql#hasGeometry", "geom");
            map.put("http://slipo.eu/def#name", "name");
            map.put("http://slipo.eu/def#source", "sourceInfo");
            // Wrap so that callers cannot alter the shared mapping.
            PROPERTY_MAPPINGS = Collections.unmodifiableMap(map);
        }

        /** Not instantiable: constants holder. */
        private Mapping() {
        }
    }

    /**
     * Properties used for fusion of ensembles.
     */
    public static class Properties {

        /** URI of the address property. */
        public static final String ADDRESS = "http://slipo.eu/def#address";

        /** URI of the street property. */
        public static final String STREET = "http://slipo.eu/def#street";

        /** URI of the name property. */
        public static final String NAME = "http://slipo.eu/def#name";

        /** URI of the name-value property. */
        public static final String NAME_VALUE = "http://slipo.eu/def#nameValue";

        /** URI of the homepage property. */
        public static final String HOMEPAGE = "http://slipo.eu/def#homepage";

        /** Not instantiable: constants holder. */
        private Properties() {
        }
    }

    /**
     * Similarity class groups anything that has to do with similarity constants.
     */
    public static class Similarity {

        /**
         * Defines the strength of the collator ({@link Collator#IDENTICAL}). Collator is used to
         * compare words.
         */
        public static final int COLLATOR_STRENGTH = Collator.IDENTICAL;

        /**
         * Similarity accepted error.
         */
        public static final double SIMILARITY_ACCEPTED_ERROR = 0.05;

        /**
         * Similarity score is considered 1 if it is greater than this value.
         */
        public static final double SIMILARITY_MAX = 0.999;

        /**
         * Similarity score is considered 0 if it is less than this value.
         */
        public static final double SIMILARITY_MIN = 0.001;

        /**
         * 5 decimal digits rounding.
         */
        public static final int ROUND_DECIMALS_5 = 5;

        /**
         * 3 decimal digits rounding.
         */
        public static final int ROUND_DECIMALS_3 = 3;

        /**
         * 2 decimal digits rounding.
         */
        public static final int ROUND_DECIMALS_2 = 2;

        /**
         * Name for Cosine metric.
         */
        public static final String COSINE = "cosine";

        /**
         * Name for Jaccard metric.
         */
        public static final String JACCARD = "jaccard";

        /**
         * Name for Levenshtein metric.
         */
        public static final String LEVENSHTEIN = "levenshtein";

        /**
         * Name for Jaro metric.
         */
        public static final String JARO = "jaro";

        /**
         * Name for Jaro-Winkler metric.
         */
        public static final String JARO_WINKLER = "jarowinkler";

        /**
         * Name for Sorted Jaro-Winkler metric.
         */
        public static final String SORTED_JARO_WINKLER = "sortedjarowinkler";

        /**
         * Name for longest common subsequence metric.
         */
        public static final String LCS = "longestcommonsubsequence";

        /**
         * Name for 2-gram metric. Note the mixed case, unlike the other metric names.
         */
        public static final String GRAM_2 = "2Gram";

        /** Not instantiable: constants holder. */
        private Similarity() {
        }
    }

    /**
     * Evaluation class groups anything that has to do with constants used in the evaluation
     * process.
     *
     * <p>NOTE(review): the weight fields below are declared as non-final boxed {@code Double}
     * values, so they can be reassigned at runtime. They are left that way because code outside
     * this file may rely on it; confirm before making them {@code final}.
     */
    public static class Evaluation {

        /**
         * Default mismatch threshold.
         */
        public static final double MISMATCH_THRESHOLD = 0.75;

        /**
         * Default base weight.
         */
        public static Double BASE_WEIGHT = 0.7;

        /**
         * Default mismatch weight.
         */
        public static Double MISMATCH_WEIGHT = 0.3;

        /**
         * Sum of base and mismatch weights (with the defaults above: 0.7 + 0.3).
         */
        public static Double MERGED_BASE_MISMATCH_WEIGHT = 1.0;

        /**
         * Special terms weight.
         */
        public static Double SPECIAL_TERMS_WEIGHT = 0.0;

        /**
         * Common special terms weight.
         */
        public static Double COMMON_SPECIAL_TERM_WEIGHT = 0.0;

        /**
         * Lower case vowels, including the umlauts. Used for checking the presence of vowels in
         * words.
         */
        public static final String LOWERCASE_VOWELS = "aeiouäöü";

        /** Not instantiable: constants holder. */
        private Evaluation() {
        }
    }

    /**
     * Stats class groups anything that has to do with statistic constants.
     */
    public static class Stats {

        /**
         * Delimiter used in the statistic process. The constant name keeps its historical
         * spelling because it is part of the public interface.
         */
        public static final String DELIMETER = ".";

        /**
         * Names property constant.
         */
        public static final String NAMES = "names";

        /**
         * Phones property constant.
         */
        public static final String PHONES = "phones";

        /**
         * Empty property constant.
         */
        public static final String EMPTY = "empty";

        /**
         * Non-empty property constant.
         */
        public static final String NON_EMPTY = "nonEmpty";

        /**
         * Percent constant.
         */
        public static final String PERCENT = "percent";

        /**
         * Total constant.
         */
        public static final String TOTAL = "total";

        /**
         * Name for input dataset A (left).
         */
        public static final String INPUT_A = "dataset_a";

        /**
         * Name for input dataset B (right).
         */
        public static final String INPUT_B = "dataset_b";

        /** Not instantiable: constants holder. */
        private Stats() {
        }
    }

    /**
     * Regex class contains values of all regexes used.
     */
    public static class Regex {

        //v0.1
        /**
         * Abbreviation regex 1: a word boundary followed by two or more groups, each made of an
         * uppercase ASCII letter and any number of lowercase ASCII letters.
         */
        public static final String ABBR_REGEX = "\\b(?:[A-Z][a-z]*){2,}";

        /**
         * Abbreviation regex 2: either a run of dotted uppercase letters (optionally ending with
         * an undotted uppercase letter) or a run of ASCII letters and apostrophes.
         */
        public static final String ABBR_REGEX2 = "((?:[A-Z]\\.)+[A-Z]?|[a-zA-Z']+)";

        /**
         * Abbreviation regex 3: a word boundary followed by two or more dotted ASCII letters.
         */
        public static final String ABBR_REGEX3 = "\\b(?:[a-zA-Z]\\.){2,}";

        /**
         * Regex for uppercase acronyms: matches, from the start of the input, a prefix that
         * contains at least two uppercase ASCII letters.
         */
        public static final String UPPER_CASE_2 = "^(.*?[A-Z]){2,}";

        /**
         * Regex matching numeric strings (one or more digits).
         */
        public static final String NUMERIC = "\\d+";

        /**
         * Regex matching a single character that is not an ASCII digit.
         */
        public static final String NON_NUMERIC = "[^0-9]";

        //public static final String NON_WORD_CHARACTERS_REGEX = "\\W";
        //public static final String NON_WORD_EXCEPT_PARENTHESIS_REGEX = "[^(),a-zA-Z]";
        //public static final String NON_WORD_EXCEPT_PARENTHESIS_REGEX_2 = "[^\\p{L}\\p{Nd}]+";
        /**
         * Regex matching punctuation except parenthesis (character-class intersection of
         * {@code Punct} with everything but the two parenthesis characters).
         */
        public static final String PUNCTUATION_EXCEPT_PARENTHESIS_REGEX = "[\\p{Punct}&&[^()]]";

        //v0.2
        //removes - _ / @
        /**
         * Regex matching custom symbols. Used for removing hyphen, underscore, slash and at-sign
         * from literals.
         */
        public static final String SPECIAL_CHARS = "[\\-\\_\\/\\@]";

        /**
         * Regex matching custom characters: double quote, comma, dot, hyphen, underscore and
         * at-sign.
         */
        public static final String SIMPLE_SPECIAL_CHARS = "[\\\"\\,\\.\\-\\_\\@]";

        /** Not instantiable: constants holder. */
        private Regex() {
        }
    }

    /**
     * Functions class contains all name constants of the evaluation functions. Every value is
     * written in lower case without separators.
     */
    public static class Functions {

        /* Dates */
        /** Name constant {@code isdateknownformat}. */
        public static final String IS_DATE_KNOWN_FORMAT = "isdateknownformat";
        /** Name constant {@code isdateprimaryformat}. */
        public static final String IS_DATE_PRIMARY_FORMAT = "isdateprimaryformat";
        /** Name constant {@code isvaliddate}. */
        public static final String IS_VALID_DATE = "isvaliddate";
        /** Name constant {@code datesaresame}. */
        public static final String DATES_ARE_SAME = "datesaresame";

        /* String literals */
        /** Name constant {@code isliteralabbreviation}. */
        public static final String IS_LITERAL_ABBREVIATION = "isliteralabbreviation";
        /** Name constant {@code issamenormalized}. */
        public static final String IS_SAME_NORMALIZED = "issamenormalized";
        /** Name constant {@code issamesimplenormalize}. */
        public static final String IS_SAME_SIMPLE_NORMALIZE = "issamesimplenormalize";
        /** Name constant {@code issamecustomnormalize}. */
        public static final String IS_SAME_CUSTOM_NORMALIZE = "issamecustomnormalize";
        /** Name constant {@code isliteralnumeric}. */
        public static final String IS_LITERAL_NUMERIC = "isliteralnumeric";
        /** Name constant {@code literalcontains}. */
        public static final String LITERAL_CONTAINS = "literalcontains";
        /** Name constant {@code literalcontainstheother}. */
        public static final String LITERAL_CONTAINS_THE_OTHER = "literalcontainstheother";
        /** Name constant {@code literalhaslanguageannotation}. */
        public static final String LITERAL_HAS_LANGUAGE_ANNOTATION = "literalhaslanguageannotation";
        /** Name constant {@code literalshavesamelanguageannotation}. */
        public static final String LITERALS_HAVE_SAME_LANG = "literalshavesamelanguageannotation";

        /* Phone number literals */
        /** Name constant {@code isphonenumberparsable}. */
        public static final String IS_PHONE_NUMBER_PARSABLE = "isphonenumberparsable";
        /** Name constant {@code issamephonenumber}. */
        public static final String IS_SAME_PHONE_NUMBER = "issamephonenumber";
        /** Name constant {@code issamephonenumbercustomnormalize}. */
        public static final String IS_SAME_PHONE_NUMBER_CUSTOM_NORMALIZE = "issamephonenumbercustomnormalize";
        /** Name constant {@code issamephonenumberusingexitcode}. */
        public static final String IS_SAME_PHONE_NUMBER_EXIT_CODE = "issamephonenumberusingexitcode";
        /** Name constant {@code phonehasmoredigits}. */
        public static final String PHONE_HAS_MORE_DIGITS = "phonehasmoredigits";

        /* Property */
        /** Name constant {@code exists}. */
        public static final String EXISTS = "exists";
        /** Name constant {@code notexists}. */
        public static final String NOT_EXISTS = "notexists";

        /* Geometry literals */
        /** Name constant {@code isgeometrymorecomplex}. */
        public static final String IS_GEOMETRY_MORE_COMPLEX = "isgeometrymorecomplex";
        /** Name constant {@code issamecentroid}. */
        public static final String IS_SAME_CENTROID = "issamecentroid";
        /** Name constant {@code ispointgeometry}. */
        public static final String IS_POINT_GEOMETRY = "ispointgeometry";
        /** Name constant {@code geometriesintersect}. */
        public static final String GEOMETRIES_INTERSECT = "geometriesintersect";
        /** Name constant {@code geometriescloserthan}. */
        public static final String GEOMETRIES_CLOSER_THAN = "geometriescloserthan";
        /** Name constant {@code isgeometrycoveredby}. */
        public static final String IS_GEOMETRY_COVERED_BY = "isgeometrycoveredby";
        /** Name constant {@code geometrieshavesamearea}. */
        public static final String GEOMETRIES_HAVE_SAME_AREA = "geometrieshavesamearea";

        /** Not instantiable: constants holder. */
        private Functions() {
        }
    }

    /**
     * Normalize class contains all name constants of the normalize functions.
     */
    public static class Normalize {

        /** Name constant {@code phonenumbernormalizer}. */
        public static final String PHONE_NUMBER_NORMALIZER = "phonenumbernormalizer";
        /** Name constant {@code normalizedatetoformat}. */
        public static final String NORMALIZE_DATE_TO_FORMAT = "normalizedatetoformat";
        /** Name constant {@code alphabeticalnormalizer}. */
        public static final String NORMALIZE_ALPHABETICALLY = "alphabeticalnormalizer";
        /** Name constant {@code tolowercase}. */
        public static final String TO_LOWER_CASE = "tolowercase";
        /** Name constant {@code removespecialcharacters}. */
        public static final String REMOVE_SPECIAL_CHARACTERS = "removespecialcharacters";
        /** Name constant {@code basicgenericnormalizer}. */
        public static final String BASIC_NORMALIZER = "basicgenericnormalizer";

        /** Not instantiable: constants holder. */
        private Normalize() {
        }
    }
}