Skip to content

Commit

Permalink
Improved compression and reduced memory usage from 15x to 13x.
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaGrebnov committed Jan 6, 2022
1 parent 0f9a69e commit 0b4dc3c
Show file tree
Hide file tree
Showing 8 changed files with 1,084 additions and 210 deletions.
9 changes: 9 additions & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Authors of bsc-m03

Ilya Grebnov <Ilya.Grebnov@gmail.com>

-- This program is based on (at least) the work of

Michael Maniscalco, Atsushi Komiya, Pochang Chen and Surya Kandau.


4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
* 2022-01-05 : Version 0.2
* Improved compression.
* Reduced memory usage from 15x to 13x.

* 2021-12-07 : Version 0.1.1 - 0.1.2
* Slightly improved compression using symbols history.

Expand Down
129 changes: 66 additions & 63 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@ The bsc-m03 is experimental block sorting compressor based on M03 context aware
* Michael Maniscalco *M03: A solution for context based blocksort (BWT) compression*, 2004
* Jurgen Abel *Post BWT stages of the Burrows-Wheeler compression algorithm*, 2010

Copyright (c) 2021 Ilya Grebnov <ilya.grebnov@gmail.com>
Copyright (c) 2021-2022 Ilya Grebnov <ilya.grebnov@gmail.com>

## License
The bsc-m03 is released under the [GNU General Public License](LICENSE "GNU General Public License")

## Changes
* 2022-01-05 : Version 0.2
* Improved compression.
* Reduced memory usage from 15x to 13x.
* 2021-12-07 : Version 0.1.1 - 0.1.2
* Slightly improved compression using symbols history.
* 2021-12-03 : Version 0.1.0
Expand All @@ -20,89 +23,89 @@ The libsais is released under the [GNU General Public License](LICENSE "GNU Gene
### Calgary Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bib | 111261 | 25090 | 1.804 |
| book1 | 768771 | 207896 | 2.163 |
| book2 | 610856 | 141204 | 1.849 |
| geo | 102400 | 52821 | 4.127 |
| news | 377109 | 107940 | 2.290 |
| obj1 | 21504 | 9903 | 3.684 |
| obj2 | 246814 | 69338 | 2.247 |
| paper1 | 53161 | 15327 | 2.307 |
| paper2 | 82199 | 23090 | 2.247 |
| pic | 513216 | 44960 | 0.701 |
| progc | 39611 | 11522 | 2.327 |
| progl | 71646 | 13886 | 1.551 |
| progp | 49379 | 9512 | 1.541 |
| trans | 93695 | 15738 | 1.344 |
| bib | 111261 | 24832 | 1.785 |
| book1 | 768771 | 206247 | 2.146 |
| book2 | 610856 | 140103 | 1.835 |
| geo | 102400 | 52597 | 4.109 |
| news | 377109 | 107049 | 2.271 |
| obj1 | 21504 | 9863 | 3.669 |
| obj2 | 246814 | 68833 | 2.231 |
| paper1 | 53161 | 15145 | 2.279 |
| paper2 | 82199 | 22824 | 2.221 |
| pic | 513216 | 44694 | 0.697 |
| progc | 39611 | 11390 | 2.300 |
| progl | 71646 | 13689 | 1.529 |
| progp | 49379 | 9376 | 1.519 |
| trans | 93695 | 15550 | 1.328 |

### Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| alice29.txt | 152089 | 39239 | 2.064 |
| asyoulik.txt | 125179 | 36500 | 2.333 |
| cp.html | 24603 | 7045 | 2.291 |
| fields.c | 11150 | 2751 | 1.974 |
| grammar.lsp | 3721 | 1146 | 2.464 |
| kennedy.xls | 1029744 | 58981 | 0.458 |
| lcet10.txt | 426754 | 96489 | 1.809 |
| plrabn12.txt | 481861 | 131455 | 2.182 |
| ptt5 | 513216 | 44960 | 0.701 |
| sum | 38240 | 11634 | 2.434 |
| xargs.1 | 4227 | 1619 | 3.064 |
| alice29.txt | 152089 | 38841 | 2.043 |
| asyoulik.txt | 125179 | 36149 | 2.310 |
| cp.html | 24603 | 6969 | 2.266 |
| fields.c | 11150 | 2712 | 1.946 |
| grammar.lsp | 3721 | 1138 | 2.447 |
| kennedy.xls | 1029744 | 56929 | 0.442 |
| lcet10.txt | 426754 | 95628 | 1.793 |
| plrabn12.txt | 481861 | 130437 | 2.166 |
| ptt5 | 513216 | 44694 | 0.697 |
| sum | 38240 | 11539 | 2.414 |
| xargs.1 | 4227 | 1603 | 3.034 |

### Large Canterbury Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| bible.txt | 4047392 | 707595 | 1.399 |
| E.coli | 4638690 | 1138016 | 1.963 |
| world192.txt | 2473400 | 383714 | 1.241 |
| bible.txt | 4047392 | 703933 | 1.391 |
| E.coli | 4638690 | 1129304 | 1.948 |
| world192.txt | 2473400 | 381247 | 1.233 |

### Silesia Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| dickens | 10192446 | 2217969 | 1.741 |
| mozilla | 51220480 | 15783932 | 2.465 |
| mr | 9970564 | 2168743 | 1.740 |
| nci | 33553445 | 1147263 | 0.274 |
| ooffice | 6152192 | 2533659 | 3.295 |
| osdb | 10085684 | 2250926 | 1.785 |
| reymont | 6627202 | 969844 | 1.171 |
| samba | 21606400 | 3867735 | 1.432 |
| sao | 7251944 | 4671964 | 5.154 |
| webster | 41458703 | 6308597 | 1.217 |
| xml | 5345280 | 367777 | 0.550 |
| x-ray | 8474240 | 3698602 | 3.492 |
| dickens | 10192446 | 2208219 | 1.733 |
| mozilla | 51220480 | 15704019 | 2.453 |
| mr | 9970564 | 2160359 | 1.733 |
| nci | 33553445 | 1137038 | 0.271 |
| ooffice | 6152192 | 2522972 | 3.281 |
| osdb | 10085684 | 2230920 | 1.770 |
| reymont | 6627202 | 964011 | 1.164 |
| samba | 21606400 | 3839503 | 1.422 |
| sao | 7251944 | 4656134 | 5.136 |
| webster | 41458703 | 6279969 | 1.212 |
| xml | 5345280 | 364952 | 0.546 |
| x-ray | 8474240 | 3685642 | 3.479 |

### Manzini Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| chr22.dna | 34553758 | 7262258 | 1.681 |
| etext99 | 105277340 | 21702753 | 1.649 |
| gcc-3.0.tar | 86630400 | 10262222 | 0.948 |
| howto | 39422105 | 7634423 | 1.549 |
| jdk13c | 69728899 | 2680040 | 0.307 |
| linux-2.4.5.tar | 116254720 | 16698531 | 1.149 |
| rctail96 | 114711151 | 9917087 | 0.692 |
| rfc | 116421901 | 15140037 | 1.040 |
| sprot34.dat | 109617186 | 17470714 | 1.275 |
| w3c2 | 104201579 | 5765329 | 0.443 |
| chr22.dna | 34553758 | 7227116 | 1.673 |
| etext99 | 105277340 | 21586520 | 1.640 |
| gcc-3.0.tar | 86630400 | 10198397 | 0.942 |
| howto | 39422105 | 7594162 | 1.541 |
| jdk13c | 69728899 | 2659297 | 0.305 |
| linux-2.4.5.tar | 116254720 | 16599153 | 1.142 |
| rctail96 | 114711151 | 9852234 | 0.687 |
| rfc | 116421901 | 15047359 | 1.034 |
| sprot34.dat | 109617186 | 17382679 | 1.269 |
| w3c2 | 104201579 | 5717299 | 0.439 |

### Maximum Compression Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| A10.jpg | 842468 | 825194 | 7.836 |
| AcroRd32.exe | 3870784 | 1575980 | 3.257 |
| english.dic | 465211 | 148615 | 2.556 |
| FlashMX.pdf | 4526946 | 3732982 | 6.597 |
| FP.LOG | 20617071 | 513540 | 0.199 |
| MSO97.DLL | 3782416 | 1897216 | 4.013 |
| ohs.doc | 4168192 | 814824 | 1.564 |
| rafale.bmp | 4149414 | 750466 | 1.447 |
| vcfiu.hlp | 4121418 | 617241 | 1.198 |
| world95.txt | 2988578 | 451042 | 1.207 |
| A10.jpg | 842468 | 823856 | 7.823 |
| AcroRd32.exe | 3870784 | 1568677 | 3.242 |
| english.dic | 465211 | 147280 | 2.533 |
| FlashMX.pdf | 4526946 | 3721859 | 6.577 |
| FP.LOG | 20617071 | 508327 | 0.197 |
| MSO97.DLL | 3782416 | 1890558 | 3.999 |
| ohs.doc | 4168192 | 810011 | 1.555 |
| rafale.bmp | 4149414 | 745966 | 1.438 |
| vcfiu.hlp | 4121418 | 613304 | 1.190 |
| world95.txt | 2988578 | 448323 | 1.200 |

### Large Text Compression Benchmark Corpus ###
| File name | Input size (bytes) | Output size (bytes) | Bits per symbol |
|:---------------:|:-----------:|:------------:|:-------:|
| enwik8 | 100000000 | 20486072 | 1.639 |
| enwik9 | 1000000000 | 161794295 | 1.294 |
| enwik8 | 100000000 | 20398312 | 1.632 |
| enwik9 | 1000000000 | 161062976 | 1.289 |
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.2
0.2.0
112 changes: 56 additions & 56 deletions bsc-m03.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
This file is a part of bsc-m03 project.
Copyright (c) 2021 Ilya Grebnov <ilya.grebnov@gmail.com>
Copyright (c) 2021-2022 Ilya Grebnov <ilya.grebnov@gmail.com>
bsc-m03 is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -45,19 +45,20 @@ This file is a part of bsc-m03 project.

int32_t root_frequencies[MAX_ALPHABET_SIZE + 1];

static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size, int32_t symbol_size)
template <class symbol_t> static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size)
{
int32_t indexes[32] = { -1 };
int32_t comressed_size = -1;
int32_t symbol_size = (int32_t)sizeof(symbol_t);
int32_t block_symbols = block_size / symbol_size;
int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576));

if (block_size % symbol_size != 0)
{
fprintf(stderr, "\nError: Block size of %d bytes is not a multiple of symbol width!\n", block_size);
return -2;
}

int32_t indexes[32] = { -1 };
int32_t comressed_size = -1;
int32_t block_symbols = block_size / symbol_size;
int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576));

if (int32_t * libsais_temp = (int32_t *)malloc(block_symbols * sizeof(int32_t)))
{
int32_t result = symbol_size == 1
Expand All @@ -68,23 +69,15 @@ static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size, int32

if (result == 0)
{
if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t)))
if (symbol_t * L = (symbol_t *)malloc(((size_t)block_symbols + 1) * sizeof(symbol_t)))
{
if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser)))
if (m03_parser<symbol_t> * parser = (m03_parser<symbol_t> *)malloc(sizeof(m03_parser<symbol_t>)))
{
{
int32_t primary_index = indexes[0];

if (symbol_size == 1)
{
for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t)buffer[p]); }
for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t)buffer[p]); }
}
else
{
for (int32_t p = 0; p < primary_index; ++p) { L[p + 0] = ((uint16_t *)buffer)[p]; }
for (int32_t p = primary_index; p < block_symbols; ++p) { L[p + 1] = ((uint16_t *)buffer)[p]; }
}
memcpy(&L[0] , &((symbol_t *)buffer)[0] , primary_index * sizeof(symbol_t));
memcpy(&L[primary_index + 1], &((symbol_t *)buffer)[primary_index], ((size_t)block_symbols - (size_t)primary_index) * sizeof(symbol_t));

L[primary_index] = 0;
}
Expand All @@ -98,7 +91,7 @@ static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size, int32
coder.EncodeValue(1, indexes[t], block_symbols);
}

if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 256 : 256 * 256, &coder, m03_mode::encoding))
if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, 1 << (8 * symbol_size), &coder, m03_mode::encoding))
{
parser->run();
parser->destroy();
Expand Down Expand Up @@ -137,45 +130,26 @@ static int32_t compress_memory_block(uint8_t * buffer, int32_t block_size, int32
return comressed_size;
}

static int32_t decompress_memory_block(uint8_t * buffer, int32_t comressed_size, int32_t block_size)
template <class symbol_t> static int32_t decompress_burrows_wheeler_transform(RangeCoder * coder, int32_t primary_index, int32_t block_size, uint8_t * buffer)
{
RangeCoder coder;
coder.InitDecoder(buffer);
int32_t symbol_size = coder.DecodeValue(1, 2);
int32_t result = -1;
int32_t symbol_size = (int32_t)sizeof(symbol_t);
int32_t block_symbols = block_size / symbol_size;

int32_t indexes[32] = { -1 };
int32_t primary_index = -1;
int32_t decomressed_size = -1;
int32_t block_symbols = block_size / symbol_size;
int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576));

for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t)
{
indexes[t] = coder.DecodeValue(1, block_symbols);
}

if (uint16_t * L = (uint16_t *)malloc(((size_t)block_symbols + 1) * sizeof(uint16_t)))
if (symbol_t * L = (symbol_t *)malloc(((size_t)block_symbols + 1) * sizeof(symbol_t)))
{
if (m03_parser * parser = (m03_parser *)malloc(sizeof(m03_parser)))
if (m03_parser<symbol_t> * parser = (m03_parser<symbol_t> *)malloc(sizeof(m03_parser<symbol_t>)))
{
if (parser->initialize(L, block_symbols + 1, indexes[0], root_frequencies, symbol_size == 1 ? 256 : 256 * 256, &coder, m03_mode::decoding))
if (parser->initialize(L, block_symbols + 1, primary_index, root_frequencies, 1 << (8 * symbol_size), coder, m03_mode::decoding))
{
parser->run();
parser->destroy();

{
primary_index = indexes[0];
memcpy(&((symbol_t *)buffer)[0] , &L[0] , primary_index * sizeof(symbol_t));
memcpy(&((symbol_t *)buffer)[primary_index], &L[primary_index + 1], ((size_t)block_symbols - (size_t)primary_index) * sizeof(symbol_t));

if (symbol_size == 1)
{
for (int32_t p = 0; p < primary_index; ++p) { buffer[p] = (uint8_t)L[p + 0]; }
for (int32_t p = primary_index; p < block_symbols; ++p) { buffer[p] = (uint8_t)L[p + 1]; }
}
else
{
for (int32_t p = 0; p < primary_index; ++p) { ((uint16_t *)buffer)[p] = L[p + 0]; }
for (int32_t p = primary_index; p < block_symbols; ++p) { ((uint16_t *)buffer)[p] = L[p + 1]; }
}
result = 0;
}
}
else
Expand All @@ -197,11 +171,34 @@ static int32_t decompress_memory_block(uint8_t * buffer, int32_t comressed_size,
fprintf(stderr, "\nError: Not enough memory!\n");
}

if (primary_index > 0)
return result;
}

static int32_t decompress_memory_block(uint8_t * buffer, int32_t comressed_size, int32_t block_size)
{
RangeCoder coder;
coder.InitDecoder(buffer);

int32_t indexes[32] = { -1 };
int32_t decomressed_size = -1;
int32_t symbol_size = coder.DecodeValue(1, 2);
int32_t block_symbols = block_size / symbol_size;
int32_t r = next_power_of_2(std::max(block_symbols / 16, 1048576));

for (int32_t t = 0; t <= (block_symbols - 1) / r; ++t)
{
indexes[t] = coder.DecodeValue(1, block_symbols);
}

int32_t result = symbol_size == 1
? decompress_burrows_wheeler_transform<uint8_t> (&coder, indexes[0], block_size, buffer)
: decompress_burrows_wheeler_transform<uint16_t>(&coder, indexes[0], block_size, buffer);

if (result == 0)
{
if (int32_t * libsais_temp = (int32_t *)malloc(((size_t)block_symbols + 1) * sizeof(int32_t)))
{
int32_t result = symbol_size == 1
result = symbol_size == 1
? libsais_unbwt_aux(buffer, buffer, libsais_temp, block_symbols, root_frequencies, r, indexes)
: libsais16_unbwt_aux((uint16_t *)buffer, (uint16_t *)buffer, libsais_temp, block_symbols, root_frequencies, r, indexes);

Expand Down Expand Up @@ -250,7 +247,10 @@ static int compress_file(const char * input_file_name, const char * output_file_
break;
}

int32_t comressed_size = compress_memory_block(buffer, block_size, symbol_size);
int32_t comressed_size = symbol_size == 1
? compress_memory_block<uint8_t> (buffer, block_size)
: compress_memory_block<uint16_t>(buffer, block_size);

if (comressed_size <= 0) { break; }

if (fwrite(&block_size, sizeof(uint8_t), sizeof(block_size), output_file) != sizeof(block_size))
Expand Down Expand Up @@ -401,17 +401,17 @@ static int decompress_file(const char * input_file_name, const char * output_fil
// Prints the command-line usage summary to stdout: the <e|d> mode with the
// input/output file arguments, followed by the -b (block size) and -w (symbol
// width) options. Always returns 0.
// NOTE(review): the two "-b<size>" lines look like the removed (~15x) and added
// (~13x) sides of the same diff line rather than two intended outputs — confirm
// against the actual file.
static int print_usage()
{
fprintf(stdout, "Usage: bsc-m03 <e|d> input-file output-file <options>\n");
fprintf(stdout, " -b<size> Block size in bytes, default 128MB (memory usage is ~15x).\n");
fprintf(stdout, " -b<size> Block size in bytes, default 128MB (memory usage is ~13x).\n");
fprintf(stdout, " -w<8|16> Symbol width in bits.\n");

return 0;
}

int main(int argc, const char * argv[])
{
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.1.2 (7 December 2021).\n");
fprintf(stdout, "Copyright (c) 2021 Ilya Grebnov <Ilya.Grebnov@gmail.com>. ABSOLUTELY NO WARRANTY.\n");
fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco and Atsushi Komiya.\n\n");
fprintf(stdout, "bsc-m03 is experimental block sorting compressor. Version 0.2.0 (05 January 2022).\n");
fprintf(stdout, "Copyright (c) 2021-2022 Ilya Grebnov <Ilya.Grebnov@gmail.com>. ABSOLUTELY NO WARRANTY.\n");
fprintf(stdout, "This program is based on (at least) the work of Michael Maniscalco (see AUTHORS).\n\n");

int32_t max_block_size = 128 * 1024 * 1024;
int32_t symbol_width = 8;
Expand Down
Loading

0 comments on commit 0b4dc3c

Please sign in to comment.