Skip to content

Commit

Permalink
Make Faker deterministic
Browse files Browse the repository at this point in the history
  • Loading branch information
nineinchnick authored and ebyhr committed Nov 6, 2024
1 parent 55e1deb commit 80ad6c4
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 20 deletions.
3 changes: 0 additions & 3 deletions docs/src/main/sphinx/connector/faker.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,4 @@ CREATE TABLE generator.default.customer (

## Limitations

- Generated data is not deterministic. There is no way to specify a seed for
the random generator. The same query reading from catalogs using this
connector, executed multiple times, returns different results each time.
- It is not possible to choose the locale used by the Datafaker's generators.
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,27 @@

import java.util.List;
import java.util.Random;
import java.util.random.RandomGeneratorFactory;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.util.random.RandomGenerator.JumpableGenerator;

public class FakerPageSourceProvider
implements ConnectorPageSourceProvider
{
private final Random random;
private final JumpableGenerator jumpableRandom;
private final Faker faker;

@Inject
public FakerPageSourceProvider()
{
random = new Random();
faker = new Faker(random);
// Every split should generate data in a sequence that does not overlap with other splits.
// To make data generation deterministic, use a generator with the same seed,
// but advance its state by a different offset for every split.
// A jumpable random generator's state can be advanced forward by a big distance in a single call.
// Xoroshiro128PlusPlus has a period of 2^128 - 1, and a single jump() advances its state by 2^64 steps.
jumpableRandom = (JumpableGenerator) RandomGeneratorFactory.of("Xoroshiro128PlusPlus").create(1);
faker = new Faker(Random.from(jumpableRandom.copy()));
}

@Override
Expand All @@ -58,7 +65,17 @@ public ConnectorPageSource createPageSource(

FakerTableHandle fakerTable = (FakerTableHandle) table;
FakerSplit fakerSplit = (FakerSplit) split;
return new FakerPageSource(faker, random, handles, fakerTable.constraint(), fakerSplit.limit());
Random random = random(fakerSplit.splitNumber());
return new FakerPageSource(new Faker(random), random, handles, fakerTable.constraint(), fakerSplit.limit());
}

/**
 * Builds the random source for the split with the given number.
 *
 * <p>Starts from a fresh copy of the provider's base generator, so the shared
 * instance is never mutated, and calls {@code jump()} on that copy
 * {@code index} times before wrapping it in a {@link Random}.
 *
 * @param index number of jumps to apply; a value of zero or less applies none
 * @return a {@link Random} backed by the jumped copy of the base generator
 */
private Random random(long index)
{
    JumpableGenerator generator = this.jumpableRandom.copy();
    long remaining = index;
    // One jump per preceding split, so each split starts at its own offset in the sequence.
    while (remaining > 0) {
        generator.jump();
        remaining--;
    }
    return Random.from(generator);
}

public void validateGenerator(String generator)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import static java.util.Objects.requireNonNull;

public record FakerSplit(List<HostAddress> addresses, long limit)
public record FakerSplit(List<HostAddress> addresses, long splitNumber, long limit)
implements ConnectorSplit
{
public FakerSplit
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ public ConnectorSplitSource getSplits(
ImmutableList.Builder<ConnectorSplit> splits = ImmutableList.builder();
for (long i = 0; i < splitCount - 1; i++) {
HostAddress address = addresses.get((int) (i % addresses.size()));
splits.add(new FakerSplit(ImmutableList.of(address), MAX_ROWS_PER_SPLIT));
splits.add(new FakerSplit(ImmutableList.of(address), i, MAX_ROWS_PER_SPLIT));
}
HostAddress address = addresses.get((int) ((splitCount - 1) % addresses.size()));
long limit = fakerTable.limit() % MAX_ROWS_PER_SPLIT;
splits.add(new FakerSplit(ImmutableList.of(address), limit == 0 ? MAX_ROWS_PER_SPLIT : limit));
splits.add(new FakerSplit(ImmutableList.of(address), splitCount - 1, limit == 0 ? MAX_ROWS_PER_SPLIT : limit));
return new FixedSplitSource(splits.build());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -186,19 +186,39 @@ rnd_nchar char(1000) NOT NULL,
@Test
void testSelectLimit()
{
@Language("SQL")
String tableQuery = "CREATE TABLE faker.default.single_column (rnd_bigint bigint NOT NULL)";
assertUpdate(tableQuery);
assertUpdate("CREATE TABLE faker.default.single_column (rnd_bigint bigint NOT NULL)");

@Language("SQL")
String testQuery = "SELECT count(rnd_bigint) FROM (SELECT rnd_bigint FROM single_column LIMIT 5) a";
assertQuery(testQuery, "VALUES (5)");
assertQuery("SELECT count(rnd_bigint) FROM (SELECT rnd_bigint FROM single_column LIMIT 5) a",
"VALUES (5)");

testQuery = "SELECT count(rnd_bigint) FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a".formatted(2*MAX_ROWS_PER_SPLIT);
assertQuery(testQuery, "VALUES (%d)".formatted(2*MAX_ROWS_PER_SPLIT));
assertQuery("""
SELECT count(rnd_bigint)
FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a""".formatted(2 * MAX_ROWS_PER_SPLIT),
"VALUES (%d)".formatted(2 * MAX_ROWS_PER_SPLIT));

assertQuery("SELECT count(distinct rnd_bigint) FROM single_column LIMIT 5",
"VALUES (1000)");

assertQuery("""
SELECT count(rnd_bigint)
FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a""".formatted(MAX_ROWS_PER_SPLIT),
"VALUES (%d)".formatted(MAX_ROWS_PER_SPLIT));

// generating data should be deterministic
String testQuery = """
SELECT to_hex(checksum(rnd_bigint))
FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a""".formatted(3 * MAX_ROWS_PER_SPLIT);
assertQuery(testQuery, "VALUES ('1FB3289AC3A44EEA')");
assertQuery(testQuery, "VALUES ('1FB3289AC3A44EEA')");
assertQuery(testQuery, "VALUES ('1FB3289AC3A44EEA')");

// there should be no overlap between data generated from different splits
assertQuery("""
SELECT count(1)
FROM (SELECT rnd_bigint FROM single_column LIMIT %d) a
JOIN (SELECT rnd_bigint FROM single_column LIMIT %d) b ON a.rnd_bigint = b.rnd_bigint""".formatted(2 * MAX_ROWS_PER_SPLIT, 5 * MAX_ROWS_PER_SPLIT),
"VALUES (%d)".formatted(2 * MAX_ROWS_PER_SPLIT));

testQuery = "SELECT count(distinct rnd_bigint) FROM single_column LIMIT 5";
assertQuery(testQuery, "VALUES (1000)");
assertUpdate("DROP TABLE faker.default.single_column");
}

Expand Down

0 comments on commit 80ad6c4

Please sign in to comment.