Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support SPECIES_128 #41

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,19 @@ tasks.register('downloadTestData') {
}
}

// Test task that runs the JUnit Platform suite with the vector species forced to 128 bits
// through the org.simdjson.species system property.
tasks.register('test128', Test) {
// The test data must be downloaded before the tests run.
dependsOn downloadTestData
useJUnitPlatform()
jvmArgs += [
// The Vector API lives in an incubator module that has to be added explicitly.
'--add-modules', 'jdk.incubator.vector',
'-Xmx2g',
// Selects the "128" species branch in StructuralIndexer.
'-Dorg.simdjson.species=128'
]
testLogging {
events 'PASSED', 'SKIPPED', 'FAILED', 'STANDARD_OUT', 'STANDARD_ERROR'
}
}

tasks.register('test256', Test) {
dependsOn downloadTestData
useJUnitPlatform()
Expand Down Expand Up @@ -103,6 +116,7 @@ tasks.register('test512', Test) {
}

// The default test task fans out to one run per supported vector species.
test {
    dependsOn 'test128', 'test256', 'test512'
}
Expand Down
127 changes: 95 additions & 32 deletions src/main/java/org/simdjson/CharactersClassifier.java
Original file line number Diff line number Diff line change
@@ -1,66 +1,129 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorShuffle;

class CharactersClassifier {

private static final byte LOW_NIBBLE_MASK = 0x0f;

private static final ByteVector WHITESPACE_TABLE =
private static final ByteVector WHITESPACE_TABLE_128 =
ByteVector.fromArray(
ByteVector.SPECIES_128,
new byte[]{
' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
},
0);
private static final ByteVector WHITESPACE_TABLE_256 =
ByteVector.fromArray(
ByteVector.SPECIES_256,
new byte[]{
' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
},
0);
private static final ByteVector WHITESPACE_TABLE_512 =
ByteVector.fromArray(
StructuralIndexer.BYTE_SPECIES,
repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
ByteVector.SPECIES_512,
new byte[]{
' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100,
},
0);

private static final ByteVector OP_TABLE =
private static final ByteVector OP_TABLE_128 =
ByteVector.fromArray(
ByteVector.SPECIES_128,
new byte[]{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0,
},
0);
private static final ByteVector OP_TABLE_256 =
ByteVector.fromArray(
ByteVector.SPECIES_256,
new byte[]{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0,
},
0);
private static final ByteVector OP_TABLE_512 =
ByteVector.fromArray(
StructuralIndexer.BYTE_SPECIES,
repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
ByteVector.SPECIES_512,
new byte[]{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0,
},
0);

private static byte[] repeat(byte[] array, int n) {
byte[] result = new byte[n * array.length];
for (int dst = 0; dst < result.length; dst += array.length) {
System.arraycopy(array, 0, result, dst, array.length);
}
return result;
/**
 * Classifies the bytes of a single chunk using the 512-bit lookup tables.
 *
 * <p>The low nibble of every byte selects an entry of each table; a lane is flagged when the
 * byte (or its curlified form, for operators) equals the selected entry.
 *
 * @param chunk0 the chunk to classify; the 512-bit tables are applied to it
 * @return a block built from the whitespace bitmask and the operator bitmask
 */
static JsonCharacterBlock classify(ByteVector chunk0) {
    var nibbleShuffle = extractLowNibble(chunk0).toShuffle();
    long whitespaceBits = eq(chunk0, WHITESPACE_TABLE_512.rearrange(nibbleShuffle));
    // '[' and ']' are folded onto '{' and '}' so that one table covers both bracket kinds.
    long opBits = eq(curlify(chunk0), OP_TABLE_512.rearrange(nibbleShuffle));
    return new JsonCharacterBlock(whitespaceBits, opBits);
}

JsonCharacterBlock classify(ByteVector chunk0) {
VectorShuffle<Byte> chunk0Low = extractLowNibble(chunk0).toShuffle();
long whitespace = eq(chunk0, WHITESPACE_TABLE.rearrange(chunk0Low));
ByteVector curlified0 = curlify(chunk0);
long op = eq(curlified0, OP_TABLE.rearrange(chunk0Low));
/**
 * Classifies the bytes of two chunks using the 256-bit lookup tables.
 *
 * <p>The low nibble of every byte selects an entry of each table; a lane is flagged when the
 * byte (or its curlified form, for operators) equals the selected entry. The per-chunk results
 * are merged into one bitmask by the two-chunk {@code eq} helper.
 *
 * @param chunk0 first chunk
 * @param chunk1 second chunk
 * @return a block built from the combined whitespace bitmask and operator bitmask
 */
static JsonCharacterBlock classify(ByteVector chunk0, ByteVector chunk1) {
    var nibbles0 = extractLowNibble(chunk0).toShuffle();
    var nibbles1 = extractLowNibble(chunk1).toShuffle();
    long whitespaceBits = eq(
            chunk0, WHITESPACE_TABLE_256.rearrange(nibbles0),
            chunk1, WHITESPACE_TABLE_256.rearrange(nibbles1)
    );
    ByteVector folded0 = curlify(chunk0);
    ByteVector folded1 = curlify(chunk1);
    long opBits = eq(
            folded0, OP_TABLE_256.rearrange(nibbles0),
            folded1, OP_TABLE_256.rearrange(nibbles1)
    );
    return new JsonCharacterBlock(whitespaceBits, opBits);
}

JsonCharacterBlock classify(ByteVector chunk0, ByteVector chunk1) {
VectorShuffle<Byte> chunk0Low = extractLowNibble(chunk0).toShuffle();
VectorShuffle<Byte> chunk1Low = extractLowNibble(chunk1).toShuffle();
long whitespace = eq(chunk0, WHITESPACE_TABLE.rearrange(chunk0Low), chunk1, WHITESPACE_TABLE.rearrange(chunk1Low));
ByteVector curlified0 = curlify(chunk0);
ByteVector curlified1 = curlify(chunk1);
long op = eq(curlified0, OP_TABLE.rearrange(chunk0Low), curlified1, OP_TABLE.rearrange(chunk1Low));
/**
 * Classifies the bytes of four chunks using the 128-bit lookup tables.
 *
 * <p>The low nibble of every byte selects an entry of each table; a lane is flagged when the
 * byte (or its curlified form, for operators) equals the selected entry. The per-chunk results
 * are merged into one bitmask by the four-chunk {@code eq} helper.
 *
 * @param chunk0 first chunk
 * @param chunk1 second chunk
 * @param chunk2 third chunk
 * @param chunk3 fourth chunk
 * @return a block built from the combined whitespace bitmask and operator bitmask
 */
static JsonCharacterBlock classify(ByteVector chunk0, ByteVector chunk1, ByteVector chunk2, ByteVector chunk3) {
    var nibbles0 = extractLowNibble(chunk0).toShuffle();
    var nibbles1 = extractLowNibble(chunk1).toShuffle();
    var nibbles2 = extractLowNibble(chunk2).toShuffle();
    var nibbles3 = extractLowNibble(chunk3).toShuffle();

    long whitespaceBits = eq(
            chunk0, WHITESPACE_TABLE_128.rearrange(nibbles0),
            chunk1, WHITESPACE_TABLE_128.rearrange(nibbles1),
            chunk2, WHITESPACE_TABLE_128.rearrange(nibbles2),
            chunk3, WHITESPACE_TABLE_128.rearrange(nibbles3)
    );

    // '[' and ']' are folded onto '{' and '}' before the operator lookup.
    ByteVector folded0 = curlify(chunk0);
    ByteVector folded1 = curlify(chunk1);
    ByteVector folded2 = curlify(chunk2);
    ByteVector folded3 = curlify(chunk3);

    long opBits = eq(
            folded0, OP_TABLE_128.rearrange(nibbles0),
            folded1, OP_TABLE_128.rearrange(nibbles1),
            folded2, OP_TABLE_128.rearrange(nibbles2),
            folded3, OP_TABLE_128.rearrange(nibbles3)
    );
    return new JsonCharacterBlock(whitespaceBits, opBits);
}

private ByteVector extractLowNibble(ByteVector vector) {
/**
 * Keeps only the low four bits of every lane.
 *
 * @param vector input bytes
 * @return a vector whose lanes are the input lanes ANDed with {@code LOW_NIBBLE_MASK}
 */
private static ByteVector extractLowNibble(ByteVector vector) {
    ByteVector lowNibbles = vector.and(LOW_NIBBLE_MASK);
    return lowNibbles;
}

private ByteVector curlify(ByteVector vector) {
/**
 * Sets bit 0x20 in every lane, which maps '[' (0x5B) to '{' (0x7B) and ']' (0x5D) to '}' (0x7D).
 *
 * @param vector input bytes
 * @return the input with bit 0x20 set in every lane
 */
private static ByteVector curlify(ByteVector vector) {
    final byte curlyBit = (byte) 0x20;
    return vector.or(curlyBit);
}

private long eq(ByteVector chunk0, ByteVector mask0) {
/**
 * Compares a chunk with its expected values lane by lane.
 *
 * @param chunk0 bytes to test
 * @param mask0 expected byte for each lane
 * @return the comparison mask as a long, one bit per lane
 */
private static long eq(ByteVector chunk0, ByteVector mask0) {
    long matches = chunk0.eq(mask0).toLong();
    return matches;
}
}

private long eq(ByteVector chunk0, ByteVector mask0, ByteVector chunk1, ByteVector mask1) {
/**
 * Compares two chunks with their expected values and packs both results into one long.
 *
 * @param chunk0 first chunk
 * @param mask0 expected byte for each lane of {@code chunk0}
 * @param chunk1 second chunk
 * @param mask1 expected byte for each lane of {@code chunk1}
 * @return the result for {@code chunk0} ORed with the result for {@code chunk1} shifted left by 32
 */
private static long eq(ByteVector chunk0, ByteVector mask0, ByteVector chunk1, ByteVector mask1) {
    long lowHalf = chunk0.eq(mask0).toLong();
    long highHalf = chunk1.eq(mask1).toLong();
    return (highHalf << 32) | lowHalf;
}
}

/**
 * Compares four chunks with their expected values and packs the results into one long.
 *
 * <p>Only the low 16 bits of each comparison are kept; chunk {@code i} lands at bit offset
 * {@code 16 * i}.
 *
 * @param chunk0 first chunk
 * @param mask0 expected byte for each lane of {@code chunk0}
 * @param chunk1 second chunk
 * @param mask1 expected byte for each lane of {@code chunk1}
 * @param chunk2 third chunk
 * @param mask2 expected byte for each lane of {@code chunk2}
 * @param chunk3 fourth chunk
 * @param mask3 expected byte for each lane of {@code chunk3}
 * @return the four 16-bit comparison results concatenated into a 64-bit mask
 */
private static long eq(ByteVector chunk0, ByteVector mask0, ByteVector chunk1, ByteVector mask1, ByteVector chunk2, ByteVector mask2, ByteVector chunk3, ByteVector mask3) {
    long packed = chunk0.eq(mask0).toLong() & 0xFFFFL;
    packed |= (chunk1.eq(mask1).toLong() & 0xFFFFL) << 16;
    packed |= (chunk2.eq(mask2).toLong() & 0xFFFFL) << 32;
    packed |= (chunk3.eq(mask3).toLong() & 0xFFFFL) << 48;
    return packed;
}
}

59 changes: 44 additions & 15 deletions src/main/java/org/simdjson/JsonStringScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,78 @@
import jdk.incubator.vector.ByteVector;

class JsonStringScanner {

private static final long EVEN_BITS_MASK = 0x5555555555555555L;
private static final long ODD_BITS_MASK = ~EVEN_BITS_MASK;

private final ByteVector backslashMask;
private final ByteVector quoteMask;
private static final ByteVector BACKSLASH_MASK_128 = ByteVector.broadcast(ByteVector.SPECIES_128, (byte) '\\');
private static final ByteVector QUOTE_MASK_128 = ByteVector.broadcast(ByteVector.SPECIES_128, (byte) '"');

private static final ByteVector BACKSLASH_MASK_256 = ByteVector.broadcast(ByteVector.SPECIES_256, (byte) '\\');
private static final ByteVector QUOTE_MASK_256 = ByteVector.broadcast(ByteVector.SPECIES_256, (byte) '"');

private static final ByteVector BACKSLASH_MASK_512 = ByteVector.broadcast(ByteVector.SPECIES_512, (byte) '\\');
private static final ByteVector QUOTE_MASK_512 = ByteVector.broadcast(ByteVector.SPECIES_512, (byte) '"');


private long prevInString = 0;
private long prevEscaped = 0;

JsonStringScanner() {
this.backslashMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '\\');
this.quoteMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '"');
}

JsonStringBlock next(ByteVector chunk0) {
long backslash = eq(chunk0, backslashMask);
long escaped = findEscaped(backslash);
long quote = eq(chunk0, quoteMask) & ~escaped;
long inString = prefixXor(quote) ^ prevInString;
assert chunk0.species() == ByteVector.SPECIES_512;
var backslash = eq(chunk0, BACKSLASH_MASK_512);
var escaped = findEscaped(backslash);
var quote = eq(chunk0, QUOTE_MASK_512) & ~escaped;
var inString = prefixXor(quote) ^ prevInString;
prevInString = inString >> 63;
return new JsonStringBlock(quote, inString);
}

JsonStringBlock next(ByteVector chunk0, ByteVector chunk1) {
long backslash = eq(chunk0, chunk1, backslashMask);
long escaped = findEscaped(backslash);
long quote = eq(chunk0, chunk1, quoteMask) & ~escaped;
long inString = prefixXor(quote) ^ prevInString;
assert chunk0.species() == ByteVector.SPECIES_256;
assert chunk1.species() == ByteVector.SPECIES_256;
var backslash = eq(chunk0, chunk1, BACKSLASH_MASK_256);
var escaped = findEscaped(backslash);
var quote = eq(chunk0, chunk1, QUOTE_MASK_256) & ~escaped;
var inString = prefixXor(quote) ^ prevInString;
prevInString = inString >> 63;
return new JsonStringBlock(quote, inString);
}

private long eq(ByteVector chunk0, ByteVector mask) {
/**
 * Scans four 128-bit chunks for quotes and string interiors.
 *
 * <p>Updates {@code prevInString} so that the next call continues from this block's final
 * in-string state.
 *
 * @param chunk0 first chunk (asserted to be SPECIES_128)
 * @param chunk1 second chunk (asserted to be SPECIES_128)
 * @param chunk2 third chunk (asserted to be SPECIES_128)
 * @param chunk3 fourth chunk (asserted to be SPECIES_128)
 * @return a block built from the unescaped-quote bitmask and the in-string bitmask
 */
JsonStringBlock next(ByteVector chunk0, ByteVector chunk1, ByteVector chunk2, ByteVector chunk3) {
    assert chunk0.species() == ByteVector.SPECIES_128;
    assert chunk1.species() == ByteVector.SPECIES_128;
    assert chunk2.species() == ByteVector.SPECIES_128;
    assert chunk3.species() == ByteVector.SPECIES_128;

    long backslashBits = eq(chunk0, chunk1, chunk2, chunk3, BACKSLASH_MASK_128);
    long escapedBits = findEscaped(backslashBits);
    // A quote that is itself escaped does not open or close a string.
    long quoteBits = eq(chunk0, chunk1, chunk2, chunk3, QUOTE_MASK_128) & ~escapedBits;
    long inStringBits = prefixXor(quoteBits) ^ prevInString;
    // The arithmetic shift spreads the top bit across the whole long (all ones or zero).
    prevInString = inStringBits >> 63;
    return new JsonStringBlock(quoteBits, inStringBits);
}

/**
 * Compares every lane of a chunk with the given mask vector.
 *
 * @param chunk0 bytes to test
 * @param mask value each lane is compared against
 * @return the comparison mask as a long, one bit per lane
 */
private static long eq(ByteVector chunk0, ByteVector mask) {
    return chunk0.eq(mask).toLong();
}

private long eq(ByteVector chunk0, ByteVector chunk1, ByteVector mask) {
/**
 * Compares two chunks with the same mask vector and packs both results into one long.
 *
 * @param chunk0 first chunk
 * @param chunk1 second chunk
 * @param mask value each lane is compared against
 * @return the result for {@code chunk0} ORed with the result for {@code chunk1} shifted left by 32
 */
private static long eq(ByteVector chunk0, ByteVector chunk1, ByteVector mask) {
    long lowHalf = chunk0.eq(mask).toLong();
    long highHalf = chunk1.eq(mask).toLong();
    return (highHalf << 32) | lowHalf;
}

/**
 * Compares four chunks with the same mask vector and packs the results into one long.
 *
 * <p>Only the low 16 bits of each comparison are kept; chunk {@code i} lands at bit offset
 * {@code 16 * i}.
 *
 * @param chunk0 first chunk
 * @param chunk1 second chunk
 * @param chunk2 third chunk
 * @param chunk3 fourth chunk
 * @param mask value each lane is compared against
 * @return the four 16-bit comparison results concatenated into a 64-bit mask
 */
private static long eq(ByteVector chunk0, ByteVector chunk1, ByteVector chunk2, ByteVector chunk3, ByteVector mask) {
    long packed = chunk0.eq(mask).toLong() & 0xFFFFL;
    packed |= (chunk1.eq(mask).toLong() & 0xFFFFL) << 16;
    packed |= (chunk2.eq(mask).toLong() & 0xFFFFL) << 32;
    packed |= (chunk3.eq(mask).toLong() & 0xFFFFL) << 48;
    return packed;
}

private long findEscaped(long backslash) {
if (backslash == 0) {
long escaped = prevEscaped;
Expand Down Expand Up @@ -88,3 +116,4 @@ void finish() {
}
}
}

32 changes: 27 additions & 5 deletions src/main/java/org/simdjson/StructuralIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class StructuralIndexer {
BYTE_SPECIES = ByteVector.SPECIES_256;
INT_SPECIES = IntVector.SPECIES_256;
}
case "128" -> {
BYTE_SPECIES = ByteVector.SPECIES_128;
INT_SPECIES = IntVector.SPECIES_128;
}
default -> throw new IllegalArgumentException("Unsupported vector species: " + species);
}
N_CHUNKS = 64 / BYTE_SPECIES.vectorByteSize();
Expand All @@ -36,13 +40,12 @@ class StructuralIndexer {
}

private static void assertSupportForSpecies(VectorSpecies<?> species) {
if (species.vectorShape() != VectorShape.S_256_BIT && species.vectorShape() != VectorShape.S_512_BIT) {
if (species.vectorShape() != VectorShape.S_128_BIT && species.vectorShape() != VectorShape.S_256_BIT && species.vectorShape() != VectorShape.S_512_BIT) {
throw new IllegalArgumentException("Unsupported vector species: " + species);
}
}

private final JsonStringScanner stringScanner;
private final CharactersClassifier classifier;
private final BitIndexes bitIndexes;

private long prevStructurals = 0;
Expand All @@ -51,22 +54,22 @@ private static void assertSupportForSpecies(VectorSpecies<?> species) {

StructuralIndexer(BitIndexes bitIndexes) {
this.stringScanner = new JsonStringScanner();
this.classifier = new CharactersClassifier();
this.bitIndexes = bitIndexes;
}

/**
 * Processes one block of input by dispatching to the implementation that matches the number
 * of vector chunks per block ({@code N_CHUNKS}).
 *
 * @param buffer input bytes
 * @param offset index in {@code buffer} where the block starts
 * @param blockIndex index of the block, forwarded to the selected step implementation
 * @throws RuntimeException if no implementation exists for the configured chunk count
 */
void step(byte[] buffer, int offset, int blockIndex) {
    switch (N_CHUNKS) {
        case 1 -> step1(buffer, offset, blockIndex);
        case 2 -> step2(buffer, offset, blockIndex);
        case 4 -> step4(buffer, offset, blockIndex);
        // Report the actual vector width in bits; N_CHUNKS * 64 is not the width.
        default -> throw new RuntimeException("Unsupported vector width: " + BYTE_SPECIES.vectorBitSize());
    }
}

private void step1(byte[] buffer, int offset, int blockIndex) {
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset);
JsonStringBlock strings = stringScanner.next(chunk0);
JsonCharacterBlock characters = classifier.classify(chunk0);
JsonCharacterBlock characters = CharactersClassifier.classify(chunk0);
long unescaped = lteq(chunk0, (byte) 0x1F);
finishStep(characters, strings, unescaped, blockIndex);
}
Expand All @@ -75,11 +78,22 @@ private void step2(byte[] buffer, int offset, int blockIndex) {
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_256, buffer, offset);
ByteVector chunk1 = ByteVector.fromArray(ByteVector.SPECIES_256, buffer, offset + 32);
JsonStringBlock strings = stringScanner.next(chunk0, chunk1);
JsonCharacterBlock characters = classifier.classify(chunk0, chunk1);
JsonCharacterBlock characters = CharactersClassifier.classify(chunk0, chunk1);
long unescaped = lteq(chunk0, chunk1, (byte) 0x1F);
finishStep(characters, strings, unescaped, blockIndex);
}

/**
 * Processes one 64-byte block as four consecutive 128-bit chunks.
 *
 * @param buffer input bytes
 * @param offset index in {@code buffer} of the first chunk; the others follow at +16, +32 and +48
 * @param blockIndex index of the block, forwarded to {@code finishStep}
 */
private void step4(byte[] buffer, int offset, int blockIndex) {
    var species = ByteVector.SPECIES_128;
    ByteVector chunk0 = ByteVector.fromArray(species, buffer, offset);
    ByteVector chunk1 = ByteVector.fromArray(species, buffer, offset + 16);
    ByteVector chunk2 = ByteVector.fromArray(species, buffer, offset + 32);
    ByteVector chunk3 = ByteVector.fromArray(species, buffer, offset + 48);

    JsonStringBlock stringBlock = stringScanner.next(chunk0, chunk1, chunk2, chunk3);
    JsonCharacterBlock characterBlock = CharactersClassifier.classify(chunk0, chunk1, chunk2, chunk3);
    // Flags bytes whose unsigned value is at most 0x1F.
    long lowBytes = lteq(chunk0, chunk1, chunk2, chunk3, (byte) 0x1F);
    finishStep(characterBlock, stringBlock, lowBytes, blockIndex);
}

private void finishStep(JsonCharacterBlock characters, JsonStringBlock strings, long unescaped, int blockIndex) {
long scalar = characters.scalar();
long nonQuoteScalar = scalar & ~strings.quote();
Expand All @@ -102,6 +116,14 @@ private long lteq(ByteVector chunk0, ByteVector chunk1, byte scalar) {
return r0 | (r1 << 32);
}

/**
 * Flags the lanes of four chunks whose unsigned value is less than or equal to {@code scalar}.
 *
 * <p>Only the low 16 bits of each comparison are kept; chunk {@code i} lands at bit offset
 * {@code 16 * i}.
 *
 * @param chunk0 first chunk
 * @param chunk1 second chunk
 * @param chunk2 third chunk
 * @param chunk3 fourth chunk
 * @param scalar upper bound, compared as unsigned
 * @return the four 16-bit comparison results concatenated into a 64-bit mask
 */
private long lteq(ByteVector chunk0, ByteVector chunk1, ByteVector chunk2, ByteVector chunk3, byte scalar) {
    long packed = chunk0.compare(UNSIGNED_LE, scalar).toLong() & 0xFFFFL;
    packed |= (chunk1.compare(UNSIGNED_LE, scalar).toLong() & 0xFFFFL) << 16;
    packed |= (chunk2.compare(UNSIGNED_LE, scalar).toLong() & 0xFFFFL) << 32;
    packed |= (chunk3.compare(UNSIGNED_LE, scalar).toLong() & 0xFFFFL) << 48;
    return packed;
}

void finish(int blockIndex) {
bitIndexes.write(blockIndex, prevStructurals);
bitIndexes.finish();
Expand Down
1 change: 1 addition & 0 deletions src/test/java/org/simdjson/CharactersClassifierTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ private JsonCharacterBlock classify(CharactersClassifier classifier, String str)
return switch (StructuralIndexer.N_CHUNKS) {
case 1 -> classifier.classify(chunk(str, 0));
case 2 -> classifier.classify(chunk(str, 0), chunk(str, 1));
case 4 -> classifier.classify(chunk(str, 0), chunk(str, 1), chunk(str, 2), chunk(str, 3));
default -> throw new RuntimeException("Unsupported chunk count: " + StructuralIndexer.N_CHUNKS);
};
}
Expand Down
1 change: 1 addition & 0 deletions src/test/java/org/simdjson/JsonStringScannerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ private JsonStringBlock next(JsonStringScanner scanner, String str) {
return switch (StructuralIndexer.N_CHUNKS) {
case 1 -> scanner.next(chunk(str, 0));
case 2 -> scanner.next(chunk(str, 0), chunk(str, 1));
case 4 -> scanner.next(chunk(str, 0), chunk(str, 1), chunk(str, 2), chunk(str, 3));
default -> throw new RuntimeException("Unsupported chunk count: " + StructuralIndexer.N_CHUNKS);
};
}
Expand Down
Loading