Skip to content

Commit

Permalink
feat: finish implementing proper nouns
Browse files Browse the repository at this point in the history
  • Loading branch information
Moseco committed Aug 28, 2024
1 parent 89582cc commit f36c064
Show file tree
Hide file tree
Showing 17 changed files with 1,078 additions and 375 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ example/input_files/kanjidic2.xml
example/input_files/pitch_accents.txt
example/input_files/frequency_list.txt
example/input_files/enamdict_utf-8
example/input_files/mecab.zip

# Temporary and output files
example/temp_files/
Expand Down
130 changes: 111 additions & 19 deletions example/sagase_dictionary_example.dart
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import 'dart:io';
import 'package:archive/archive_io.dart';
import 'package:drift/drift.dart';
import 'package:path/path.dart' as path;
import 'package:sagase_dictionary/src/database.dart';
import 'package:sagase_dictionary/sagase_dictionary.dart';
import 'package:sagase_dictionary/src/dictionary_builder.dart';

void main() async {
Expand All @@ -17,9 +17,6 @@ void main() async {
Directory(tempFilesPath).createSync(recursive: true);
String outputFilesPath = path.join(examplePath, 'output_files');
Directory(outputFilesPath).createSync(recursive: true);
if (File(path.join(outputFilesPath, 'dictionary.zip')).existsSync()) {
File(path.join(outputFilesPath, 'dictionary.zip')).deleteSync();
}

print('Creating dictionary');

Expand Down Expand Up @@ -66,17 +63,119 @@ void main() async {
int kanjiCount = await database.kanjis.count().getSingle();
print('Kanji ${kanjiCount == 13108 ? "valid" : "INVALID"} - $kanjiCount');

// Export database to file
// Export database without proper nouns to file
await _exportAndCompressDatabase(
database,
path.join(tempFilesPath, SagaseDictionaryConstants.dictionaryDatabaseFile),
path.join(outputFilesPath, SagaseDictionaryConstants.dictionaryZip),
);

// Add proper nouns to database
print('Adding proper nouns...');
await DictionaryBuilder.createProperNounDictionary(
database,
File(path.join(inputFilesPath, 'enamdict_utf-8')).readAsStringSync(),
showProgress: true,
);

int properNounCount = await database.properNouns.count().getSingle();
print('\nProper noun count - $properNounCount');

// Export database with proper nouns to file
await _exportAndCompressDatabase(
database,
path.join(tempFilesPath, SagaseDictionaryConstants.dictionaryDatabaseFile),
path.join(
outputFilesPath,
SagaseDictionaryConstants.dictionaryWithProperNounsZip,
),
);

// Close database
await database.close();

// Open proper noun database
print('Creating proper noun dictionary...');
final properNounDatabase = AppDatabase();
await DictionaryBuilder.createProperNounDictionary(
properNounDatabase,
File(path.join(inputFilesPath, 'enamdict_utf-8')).readAsStringSync(),
showProgress: true,
);
print('');

// Export proper noun database
await _exportAndCompressDatabase(
properNounDatabase,
path.join(
tempFilesPath,
SagaseDictionaryConstants.properNounDictionaryDatabaseFile,
),
path.join(
outputFilesPath,
SagaseDictionaryConstants.properNounDictionaryZip,
),
);

// Close proper noun database
await properNounDatabase.close();

// Create required assets tar
print("Creating required assets tar");
final archive = Archive();

String dictionaryPath = path.join(
outputFilesPath,
SagaseDictionaryConstants.dictionaryZip,
);
String mecabPath = path.join(
inputFilesPath,
SagaseDictionaryConstants.mecabZip,
);

final dictionaryBytes = await File(dictionaryPath).readAsBytes();
final dictionaryArchiveFile = ArchiveFile(
SagaseDictionaryConstants.dictionaryZip,
dictionaryBytes.length,
dictionaryBytes);
archive.addFile(dictionaryArchiveFile);

final mecabBytes = await File(mecabPath).readAsBytes();
final mecabArchiveFile = ArchiveFile(
SagaseDictionaryConstants.mecabZip,
mecabBytes.length,
mecabBytes,
);
archive.addFile(mecabArchiveFile);

final encodedArchive = TarEncoder().encode(archive);

File(path.join(
outputFilesPath,
SagaseDictionaryConstants.requiredAssetsTar,
)).writeAsBytesSync(encodedArchive);

// Delete temp files
await Directory(tempFilesPath).delete(recursive: true);

print('Done!');
}

Future<void> _exportAndCompressDatabase(
AppDatabase database,
String dbFilePath,
String archiveFilePath,
) async {
print('Exporting...');
final file = File(path.join(tempFilesPath, 'dictionary.sqlite'));
if (file.existsSync()) file.deleteSync();
await database.customStatement('VACUUM INTO ?', [file.path]);
final dbFile = File(dbFilePath);
if (dbFile.existsSync()) dbFile.deleteSync();
await database.customStatement('VACUUM INTO ?', [dbFile.path]);

// Compress the exported file
print('Compressing...');
final bytes =
await File(path.join(tempFilesPath, 'dictionary.sqlite')).readAsBytes();
final archiveFile = ArchiveFile('dictionary.sqlite', bytes.length, bytes);
final bytes = dbFile.readAsBytesSync();
final archiveFile =
ArchiveFile(dbFile.uri.pathSegments.last, bytes.length, bytes);
final archive = Archive();
archive.addFile(archiveFile);
final encodedArchive =
Expand All @@ -85,12 +184,5 @@ void main() async {
print('Compression did not work');
return;
}
await File(path.join(outputFilesPath, 'dictionary.zip'))
.writeAsBytes(encodedArchive);

// Close database and delete the temp files
await database.close();
await Directory(tempFilesPath).delete(recursive: true);

print('Done!');
File(archiveFilePath).writeAsBytesSync(encodedArchive);
}
1 change: 1 addition & 0 deletions lib/sagase_dictionary.dart
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export 'src/datamodels/kanjis.dart' show Kanji, KanjiReading;
export 'src/datamodels/my_dictionary_lists.dart' show MyDictionaryList;
export 'src/datamodels/predefined_dictionary_lists.dart'
show PredefinedDictionaryList;
export 'src/datamodels/proper_nouns.dart' show ProperNoun;
export 'src/datamodels/spaced_repetition_datas.dart' show SpacedRepetitionData;
export 'src/datamodels/vocabs.dart' show Vocab, VocabExample, VocabReference;

Expand Down
193 changes: 193 additions & 0 deletions lib/src/dao/proper_nouns_dao.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import 'package:drift/drift.dart';
import 'package:kana_kit/kana_kit.dart';
import 'package:sagase_dictionary/src/database.dart';
import 'package:sagase_dictionary/src/datamodels/proper_nouns.dart';
import 'package:sagase_dictionary/src/utils/string_utils.dart';

part 'proper_nouns_dao.g.dart';

/// Data access object for proper nouns.
///
/// Provides bulk import/delete for the `properNouns` and
/// `properNounRomajiWords` tables, exact lookups by writing/reading, and a
/// prefix [search] that handles romaji (single- and multi-word), kana, and
/// kanji queries.
@DriftAccessor(tables: [ProperNouns])
class ProperNounsDao extends DatabaseAccessor<AppDatabase>
    with _$ProperNounsDaoMixin {
  // Kana conversion helper; passRomaji makes toHiragana leave romaji
  // characters unchanged instead of transliterating them.
  final _kanaKit = const KanaKit().copyWithConfig(passRomaji: true);

  ProperNounsDao(super.db);

  /// Copies all proper noun data from the SQLite database file at [path]
  /// into this database.
  ///
  /// Attaches the external file via SQLite `ATTACH DATABASE`, bulk-copies
  /// both tables with `INSERT ... SELECT` inside a single transaction, then
  /// detaches the external database.
  ///
  /// NOTE(review): the DETACH is not in a try/finally, so a failed insert
  /// would leave the external database attached — confirm this is acceptable.
  Future<void> importProperNouns(String path) async {
    await db.customStatement('ATTACH DATABASE ? AS proper_noun_db', [path]);
    await db.transaction(() async {
      await db.customStatement(
        'INSERT INTO ${db.properNouns.actualTableName} SELECT * FROM proper_noun_db.${db.properNouns.actualTableName}',
      );
      await db.customStatement(
        'INSERT INTO ${db.properNounRomajiWords.actualTableName} SELECT * FROM proper_noun_db.${db.properNounRomajiWords.actualTableName}',
      );
    });
    await db.customStatement('DETACH DATABASE proper_noun_db');
  }

  /// Deletes every row from both proper noun tables.
  ///
  /// NOTE(review): unlike [importProperNouns], the two deletes are not
  /// wrapped in a transaction — confirm a partial delete on failure is
  /// acceptable.
  Future<void> deleteProperNouns() async {
    await db.delete(db.properNouns).go();
    await db.delete(db.properNounRomajiWords).go();
  }

  /// Returns proper nouns whose writing exactly matches [text].
  ///
  /// Matches the raw `writing` column, or `writingSearchForm` against a
  /// normalized form of [text] (lowercased, romaji widened to half width,
  /// then converted to hiragana).
  Future<List<ProperNoun>> getByWriting(String text) async {
    return (db.select(db.properNouns)
          ..where(
            (properNoun) => Expression.or([
              properNoun.writing.equals(text),
              properNoun.writingSearchForm.equals(
                  _kanaKit.toHiragana(text.toLowerCase().romajiToHalfWidth())),
            ]),
          ))
        .get();
  }

  /// Returns proper nouns whose reading exactly matches [text].
  ///
  /// Matches the raw `reading` column, or `readingSearchForm` against the
  /// hiragana conversion of [text] (e.g. so katakana input still matches).
  Future<List<ProperNoun>> getByReading(String text) async {
    return (db.select(db.properNouns)
          ..where((properNoun) => Expression.or([
                properNoun.reading.equals(text),
                properNoun.readingSearchForm.equals(_kanaKit.toHiragana(text)),
              ])))
        .get();
  }

  /// Returns proper nouns matching both [writing] and [reading] exactly.
  ///
  /// Each of the two conditions uses the same raw-or-normalized matching as
  /// [getByWriting] and [getByReading], combined with AND.
  Future<List<ProperNoun>> getByWritingAndReading(
    String writing,
    String reading,
  ) {
    return (db.select(db.properNouns)
          ..where((properNoun) => Expression.and([
                Expression.or([
                  properNoun.writing.equals(writing),
                  properNoun.writingSearchForm.equals(_kanaKit
                      .toHiragana(writing.toLowerCase().romajiToHalfWidth())),
                ]),
                Expression.or([
                  properNoun.reading.equals(reading),
                  properNoun.readingSearchForm
                      .equals(_kanaKit.toHiragana(reading)),
                ]),
              ])))
        .get();
  }

  /// Prefix-searches proper nouns by [text], returning at most 500 results.
  ///
  /// Branches on the detected script of the cleaned query:
  ///   * romaji, single word — prefix match against romaji word fragments
  ///     and reading-romaji columns;
  ///   * romaji, multiple words — join against the romaji-word table,
  ///     requiring all words to match (last word as a prefix);
  ///   * kana — prefix match on reading columns;
  ///   * anything else — prefix match on writing columns.
  ///
  /// NOTE(review): RegExp.escape inserts backslashes before regex
  /// metacharacters, and the escaped text then feeds SQL LIKE patterns —
  /// confirm this interaction is intended for queries containing such
  /// characters.
  Future<List<ProperNoun>> search(String text) async {
    final cleanedText = RegExp.escape(text).toLowerCase().removeDiacritics();

    if (_kanaKit.isRomaji(cleanedText)) {
      // Romaji
      final splits = cleanedText.splitWords();

      if (splits.length == 1) {
        // Sort key: if the noun's own romaji starts with the query, order by
        // its length; otherwise fall back to the reading-romaji length.
        final lengthColumn = db.properNouns.romaji.length.iif(
          db.properNouns.romaji.collate(Collate.noCase).like('$cleanedText%'),
          db.properNouns.readingRomaji.length,
        );

        return (db.select(db.properNouns).join([
              leftOuterJoin(
                db.properNounRomajiWords,
                db.properNounRomajiWords.properNounId.equalsExp(db.properNouns.id),
              )
            ])
              ..where(Expression.or([
                db.properNounRomajiWords.word.like('$cleanedText%'),
                db.properNouns.readingRomaji.like('$cleanedText%'),
                db.properNouns.readingRomajiSimplified.like('$cleanedText%'),
                db.properNouns.romaji
                    .collate(Collate.noCase)
                    .like('$cleanedText%'),
              ]))
              ..orderBy([OrderingTerm.asc(lengthColumn)])
              // Collapse the join back to one row per proper noun.
              ..groupBy([db.properNouns.id])
              ..limit(500))
            .map((row) => row.readTable(db.properNouns))
            .get();
      } else {
        // Create a join that matches all but the last word and starts with for the last word
        // Then use having to exclude results that don't contain all words
        final uniqueWords = splits.toSet().toList();
        // Last word in splits might match another word so make sure to separate words correctly
        late final List<String> wordsExceptLast;
        if (splits.where((e) => e == splits.last).length > 1) {
          wordsExceptLast = uniqueWords;
        } else {
          wordsExceptLast = uniqueWords.sublist(0, uniqueWords.length - 1);
        }
        final startsWithLastWord = '${splits.last}%';
        // Shortest word matching the last-word prefix, used to rank results
        // (nulls — no prefix match — sort last).
        final minStartsWithLastWordLength = db.properNounRomajiWords.word.length
            .min(
                filter: db.properNounRomajiWords.word.like(startsWithLastWord));

        return (db.select(db.properNouns).join([
              innerJoin(
                db.properNounRomajiWords,
                db.properNounRomajiWords.properNounId.equalsExp(db.properNouns.id),
              ),
            ])
              ..addColumns([minStartsWithLastWordLength])
              ..where(Expression.or([
                db.properNounRomajiWords.word.isIn(wordsExceptLast),
                db.properNounRomajiWords.word.like(startsWithLastWord),
              ]))
              ..orderBy([
                OrderingTerm.asc(
                  minStartsWithLastWordLength,
                  nulls: NullsOrder.last,
                ),
              ])
              ..groupBy(
                [db.properNouns.id],
                having: Expression.and([
                  // Require every exact word to be present: count distinct
                  // matched words and compare with the expected count.
                  db.properNounRomajiWords.word
                      .caseMatch(when: {
                        for (var w in wordsExceptLast) Variable(w): Variable(w)
                      })
                      .count(distinct: true)
                      .equals(wordsExceptLast.length),
                  // Require at least one word matching the last-word prefix.
                  CaseWhenExpression(cases: [
                    CaseWhen(
                      db.properNounRomajiWords.word.like(startsWithLastWord),
                      then: const Constant(0),
                    ),
                  ]).count().isBiggerOrEqualValue(1),
                ]),
              )
              ..limit(500))
            .map((row) => row.readTable(db.properNouns))
            .get();
      }
    } else {
      // Japanese text
      if (_kanaKit.isKana(cleanedText)) {
        // Search by reading
        return (db.select(db.properNouns)
              ..where((properNoun) => Expression.or([
                    properNoun.reading.like('$cleanedText%'),
                    properNoun.readingSearchForm
                        .like(_kanaKit.toHiragana('$cleanedText%')),
                  ]))
              ..orderBy([
                (properNoun) => OrderingTerm.asc(properNoun.reading.length),
              ])
              ..limit(500))
            .get();
      } else {
        // Search by writing
        return (db.select(db.properNouns)
              ..where((properNoun) => Expression.or([
                    properNoun.writing.like('$cleanedText%'),
                    properNoun.writingSearchForm.like(_kanaKit.toHiragana(
                        '$cleanedText%'.toLowerCase().romajiToHalfWidth())),
                  ]))
              ..orderBy([
                (properNoun) => OrderingTerm.asc(properNoun.writing.length),
                (properNoun) => OrderingTerm.asc(properNoun.reading.length),
              ])
              ..limit(500))
            .get();
      }
    }
  }
}
8 changes: 8 additions & 0 deletions lib/src/dao/proper_nouns_dao.g.dart

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit f36c064

Please sign in to comment.