Skip to content

Commit

Permalink
Use SNIK meta ontology as test data as it uses much less space than S…
Browse files Browse the repository at this point in the history
…WDF. Resolves #1.
  • Loading branch information
KonradHoeffner committed Nov 14, 2022
1 parent 30d0c5e commit bece6c2
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 73 deletions.
42 changes: 18 additions & 24 deletions src/dict/dict_sect_pfc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ impl DictSectPFC {

// translated from Java
// https://github.com/rdfhdt/hdt-java/blob/master/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java
// 0 means not found
pub fn locate(&self, element: &str) -> usize {
// binary search
let mut low: usize = 0;
Expand All @@ -61,9 +62,13 @@ impl DictSectPFC {
cmp = element.cmp(text);
//println!("mid: {} text: {} cmp: {:?}", mid, text, cmp);
}

match cmp {
Ordering::Less => high = mid - 1,
Ordering::Less => {
if (mid == 0) {
return 0;
}
high = mid - 1
}
Ordering::Greater => low = mid + 1,
Ordering::Equal => {
return (mid * self.block_size) + 1;
Expand All @@ -73,7 +78,6 @@ impl DictSectPFC {
if high < mid {
mid = high;
}
//println!("block {} but not first", mid);
let idblock = self.locate_in_block(mid, element);
if idblock == 0 {
return 0;
Expand Down Expand Up @@ -266,7 +270,7 @@ mod tests {

#[test]
fn test_section_read() {
let file = File::open("tests/resources/swdf.hdt").expect("error opening file");
let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
let mut reader = BufReader::new(file);
ControlInfo::read(&mut reader).unwrap();
Header::read(&mut reader).unwrap();
Expand All @@ -286,17 +290,13 @@ mod tests {

let shared = DictSectPFC::read(&mut reader).unwrap();
// the file contains IRIs that are used both as subject and object
assert_eq!(shared.num_strings, 23128);
assert_eq!(shared.packed_length, 396479);
assert_eq!(shared.block_size, 8);
for term in [
"http://ymatsuo.com/", "_:b5", "_:b1", "_:b10", "_:b6", "http://www.uni-koblenz.de/~sschenk",
"http://www-sop.inria.fr/acacia/personnel/Fabien.Gandon/",
] {
assert_eq!(shared.num_strings, 43);
assert_eq!(shared.packed_length, 614);
assert_eq!(shared.block_size, 16);
for term in ["http://www.snik.eu/ontology/meta/Top", "http://www.snik.eu/ontology/meta/Function", "_:b1"] {
let id = shared.locate(term);
let back = shared.extract(id);
println!("{} -> {} -> {}", term, id, back);
assert_eq!(term, back);
assert_eq!(term, back, "term does not translate back to itself {} -> {} -> {}", term, id, back);
}
let sequence = shared.sequence;
let data_size = (sequence.bits_per_entry * sequence.entries + 63) / 64;
Expand All @@ -311,21 +311,15 @@ mod tests {
}

let subjects = DictSectPFC::read(&mut reader).unwrap();
//println!("{}", subjects.num_strings);
assert_eq!(subjects.num_strings, 182);
assert_eq!(subjects.num_strings, 5);
for term in [
"http://www.eswc2006.org/topics/#topic2.7.8", "http://xmlns.com/foaf/0.1/",
"http://www.eswc2006.org/topics/#topic3.0", "http://www.eswc2006.org/topics/#topic3.2",
"http://www.eswc2006.org/topics/#topic3.4", "http://www.eswc2006.org/topics/#topic3.5",
"http://www.eswc2006.org/topics/#topic3.6", "http://www.eswc2006.org/topics/#topic3.7",
"http://www.eswc2006.org/topics/#topic3.8", "http://sdow2008.semanticweb.org/#cfp",
"file:///copiaotros/rdf/datasets/SWDF/28-11-2012/data.semanticweb.org/dumps/conferences/authors",
"file:///copiaotros/rdf/datasets/SWDF/28-11-2012/data.semanticweb.org/dumps/conferences/demos",
"http://www.snik.eu/ontology/meta", "http://www.snik.eu/ontology/meta/feature",
"http://www.snik.eu/ontology/meta/homonym", "http://www.snik.eu/ontology/meta/master",
"http://www.snik.eu/ontology/meta/typicalFeature",
] {
let id = subjects.locate(term);
let back = subjects.extract(id);
println!("{} -> {} -> {}", term, id, back);
assert_eq!(term, back);
assert_eq!(term, back, "term does not translate back to itself {} -> {} -> {}", term, id, back);
}
let sequence = subjects.sequence;
let data_size = (sequence.bits_per_entry * sequence.entries + 63) / 64;
Expand Down
19 changes: 11 additions & 8 deletions src/dict/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,30 +109,33 @@ mod tests {

#[test]
fn read_dict() {
let file = File::open("tests/resources/swdf.hdt").expect("error opening file");
let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
let mut reader = BufReader::new(file);
ControlInfo::read(&mut reader).unwrap();
Header::read(&mut reader).unwrap();
match Dict::read(&mut reader).unwrap() {
Dict::FourSectDict(dict) => {
assert_eq!("_:b1", dict.id_to_string(1, IdKind::Subject));
assert_eq!("_:b10", dict.id_to_string(2, IdKind::Subject));
assert_eq!("_:b11", dict.id_to_string(3, IdKind::Subject));
assert_eq!("http://ymatsuo.com/", dict.id_to_string(23128, IdKind::Subject));
assert_eq!(
"http://www.snik.eu/ontology/meta/ApplicationComponent",
dict.id_to_string(2, IdKind::Subject)
);
assert_eq!("http://www.snik.eu/ontology/meta/Chapter", dict.id_to_string(3, IdKind::Subject));
assert_eq!("http://www.snik.eu/ontology/meta/DataSetType", dict.id_to_string(5, IdKind::Subject));
match dict.shared {
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 23128),
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 43),
};

match dict.subjects {
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 182),
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 5),
};

match dict.predicates {
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 170),
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 23),
};

match dict.objects {
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 53401),
DictSect::PFC(sect) => assert_eq!(sect.num_strings(), 132),
};
}
};
Expand Down
16 changes: 2 additions & 14 deletions src/hdt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,10 @@ mod tests {

#[test]
fn triples() {
let file = File::open("tests/resources/swdf.hdt").expect("error opening file");
//let file = File::open("tests/resources/snik.hdt").expect("error opening file");
// let file = File::open("data/wordnet.hdt").expect("error opening file");
//let file = File::open("tests/resources/qbench2.hdt").expect("error opening file");
//let file = File::open("tests/resources/lscomplete20143.hdt").expect("error opening file");
let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
let hdt = Hdt::new(std::io::BufReader::new(file)).unwrap();
let mut triples = hdt.triples();
let v: Vec<(String, String, String)> = triples.collect();
assert_eq!(v.len(), 242256);
//assert_eq!(v.len(), 42742);
//println!("{:?}",triples.iter().filter(|(s,p,o)| s == "<http://ymatsuo.com/>"));
//<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
let sample = &v[0..8];
println!("triples {:#?}", sample);
let tws = hdt.triples_with(IdKind::Subject,"http://ymatsuo.com/");
let twsv: Vec<(String, String, String)> = tws.collect();
println!("{:?}", twsv);
assert_eq!(v.len(), 327);
}
}
2 changes: 1 addition & 1 deletion src/hdt_graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ mod tests {

#[test]
fn test_graph() {
let file = File::open("tests/resources/swdf.hdt").expect("error opening file");
let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
//let file = File::open("tests/resources/snik.hdt").expect("error opening file");
//let file = File::open("tests/resources/lscomplete20143.hdt").expect("error opening file");
let hdt = Hdt::new(std::io::BufReader::new(file)).unwrap();
Expand Down
43 changes: 17 additions & 26 deletions src/triples.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,24 +341,21 @@ mod tests {

#[test]
fn read_triples() {
let file = File::open("tests/resources/swdf.hdt").expect("error opening file");
let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
let mut reader = BufReader::new(file);
ControlInfo::read(&mut reader).unwrap();
let _header = Header::read(&mut reader).unwrap();
Dict::read(&mut reader).unwrap();
Header::read(&mut reader).unwrap();
let dict = Dict::read(&mut reader).unwrap();
let triples = TripleSect::read(&mut reader).unwrap();
let v: Vec<TripleId> = triples.read_all_ids().into_iter().collect::<Vec<TripleId>>();
assert_eq!(v.len(), 242256);
assert_eq!(v.len(), 327);
//println!("{:#?}", &v[0..30]);
assert_eq!(v[0].subject_id, 1);
assert_eq!(v[2].subject_id, 1);
assert_eq!(v[3].subject_id, 2);
//for i in 1..200 {println!("{:?}",(&v).into_iter().filter(|tid| tid.object_id == i).collect::<Vec<&TripleId>>());}
let triples_with_s = [
vec![(1, 90, 13304), (1, 101, 19384), (1, 111, 75817)],
vec![(5, 90, 13017), (5, 101, 14748), (5, 111, 75817)],
vec![(7, 90, 15802), (7, 101, 15758), (7, 104, 17490), (7, 105, 18547), (7, 111, 75817)],
];
//for i in 1..1 {println!("{:?}",(&v).into_iter().filter(|tid| tid.object_id == i).collect::<Vec<&TripleId>>());}
let triples_with_s =
[vec![(1, 11, 172), (1, 18, 9), (1, 20, 43)], vec![(2, 11, 168), (2, 14, 107), (2, 16, 6)]];
// theoretically order doesn't matter, so we should derive Hash for TripleId and use a HashSet, but that isn't needed in practice
for ts in triples_with_s {
assert_eq!(
Expand All @@ -367,25 +364,19 @@ mod tests {
);
}

let triples_with_o = [
vec![(7077, 129, 162), (12288, 150, 162), (23261, 18, 162)],
vec![(7088, 129, 184), (19818, 18, 184)],
vec![(1364, 14, 193)],
];
let triples_with_o = [vec![(10, 16, 1)], vec![(44, 1, 5)], vec![(1, 18, 9), (44, 1, 9)]];
for to in triples_with_o {
let tids = to.clone().into_iter().map(|(x, y, z)| TripleId::new(x, y, z)).collect::<Vec<TripleId>>();
//println!("{:?}", tids);
assert_eq!(tids, triples.triples_with_o(to[0].2).collect::<Vec<TripleId>>());
let ex = to.clone().into_iter().map(|(x, y, z)| TripleId::new(x, y, z)).collect::<Vec<TripleId>>();
let rec: Vec<TripleId> = triples.triples_with_o(to[0].2).collect();
assert_eq!(ex, rec, "ex {:?} rec {:?}", dict.translate_all_ids(&ex), dict.translate_all_ids(&rec));
}

//for i in 1..150 {println!("{:?}", (&v).into_iter().filter(|tid| tid.predicate_id == i).collect::<Vec<&TripleId>>());}
println!("{:?}", (&v).into_iter().filter(|tid| tid.predicate_id == 4).collect::<Vec<&TripleId>>());

let triples_with_p = [vec![(3232, 4, 3233), (3545, 4, 3643), (3642, 4, 3643), (6551, 4, 67719)]];
for to in triples_with_p {
let tids = to.clone().into_iter().map(|(x, y, z)| TripleId::new(x, y, z)).collect::<Vec<TripleId>>();
//println!("{:?}", tids);
assert_eq!(tids, triples.triples_with_p(to[0].1).collect::<Vec<TripleId>>());
//for i in 2..5 {println!("{:?}", (&v).into_iter().filter(|tid| tid.predicate_id == i).collect::<Vec<&TripleId>>());}
let triples_with_p = [vec![(44, 2, 64), (44, 2, 78)], vec![(44, 4, 175)]];
for tp in triples_with_p {
let ex = tp.clone().into_iter().map(|(x, y, z)| TripleId::new(x, y, z)).collect::<Vec<TripleId>>();
let rec: Vec<TripleId> = triples.triples_with_p(tp[0].1).collect();
assert_eq!(ex, rec, "ex {:?} rec {:?}", dict.translate_all_ids(&ex), dict.translate_all_ids(&rec));
}
}
}
Binary file added tests/resources/snikmeta.hdt
Binary file not shown.
Binary file removed tests/resources/swdf.hdt
Binary file not shown.

0 comments on commit bece6c2

Please sign in to comment.