Fix multi gpu map example #6415

Merged: 3 commits from fix-multi-gpu-map into main on Nov 22, 2023
Conversation

@lhoestq (Member) commented Nov 14, 2023

  • use torch.cuda.set_device instead of CUDA_VISIBLE_DEVICES
  • add if __name__ == "__main__"

fix #6186
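Putting the two bullets together, the fixed example presumably follows this shape. This is a minimal sketch, not the documented example itself: the map call is shown as a comment because it needs GPUs (and the `multiprocess` package) to run, and `rank_to_gpu` is a helper introduced here for illustration.

```python
def rank_to_gpu(rank, num_gpus):
    """Round-robin a map worker's rank onto a GPU index."""
    return rank % num_gpus

def gpu_computation(example, rank):
    import torch  # imported lazily so the helper above stays CPU-testable
    # torch.cuda.set_device pins this worker process to one GPU, replacing
    # the old CUDA_VISIBLE_DEVICES approach
    torch.cuda.set_device(rank_to_gpu(rank, torch.cuda.device_count()))
    # Your big GPU call goes here
    return example

if __name__ == "__main__":
    # The __main__ guard matters because "spawn" workers re-import this module.
    # In real use (sketch, requires GPUs):
    #   from multiprocess import set_start_method
    #   set_start_method("spawn")
    #   dataset = dataset.map(gpu_computation, with_rank=True,
    #                         num_proc=torch.cuda.device_count())
    pass
```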

Show benchmarks

[Collapsed CI benchmark comment: full new / old (diff) metric tables for PyArrow==8.0.0 and PyArrow==latest across benchmark_array_xd.json, benchmark_getitem_100B.json, benchmark_indices_mapping.json, benchmark_iterating.json, and benchmark_map_filter.json.]

@HuggingFaceDocBuilderDev commented Nov 14, 2023

The documentation is not available anymore as the PR was closed or merged.

>>> def gpu_computation(example, rank):
...     torch.cuda.set_device(rank % torch.cuda.device_count())
...     # Your big GPU call goes here

(the torch.cuda.set_device line replaces os.environ["CUDA_VISIBLE_DEVICES"] = str(rank % torch.cuda.device_count()))
Contributor

I would still like to see a concrete example here instead of "your big GPU call goes here", because I tried using an NLLB model with 2 GPUs to translate sentences of the datacomp dataset in parallel and it was unclear to me how I had to do it. Should I use nn.DataParallel? Should I use .to("cuda:0") and .to("cuda:1")?

Contributor

I remember that the rank was always set to 0, so all work was done on the first GPU

Member Author

> I remember that the rank was always set to 0, so all work was done on the first GPU

This happens only if you set num_proc=1, but for multiprocessing you get multiple ranks (one per process)
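An illustrative CPU-only sketch of why this is the case (a simplification for intuition, not datasets' actual shard-splitting code): with num_proc=N, the dataset is split into N shards, and each worker applies the function to its shard with its own rank, so rank is always 0 only when num_proc=1.

```python
def map_with_rank(examples, fn, num_proc):
    """Toy model of datasets.map(..., with_rank=True): split the examples
    into num_proc shards and apply fn with each shard's rank (in the real
    library each shard is processed by a separate worker process)."""
    shards = [examples[i::num_proc] for i in range(num_proc)]
    results = []
    for rank, shard in enumerate(shards):
        results.extend(fn(example, rank) for example in shard)
    return results

# With num_proc=1 every example sees rank 0; with num_proc=2, ranks 0 and 1.
ranks_seen = {rank for _, rank in
              map_with_rank(list(range(4)), lambda ex, rank: (ex, rank), 2)}
```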

Member Author

I added the model to the example - no need to use nn.DataParallel. You just need to send the model to every GPU.

Feel free to test that the code works as expected for you!

Comment on lines 358 to 359
>>> for i in range(torch.cuda.device_count()): # send model to every GPU
... model.to(torch.cuda.device(i))
@NielsRogge (Contributor) commented Nov 15, 2023

This gives me the following error:

Traceback (most recent call last):
  File "/home/niels/python_projects/datacomp/datasets_multi_gpu.py", line 14, in <module>
    model.to(torch.cuda.device(i))
  File "/home/niels/anaconda3/envs/datacomp/lib/python3.10/site-packages/torch/nn/modules/module.py", line 968, in to
    device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
TypeError: to() received an invalid combination of arguments - got (device), but expected one of:
 * (torch.device device, torch.dtype dtype, bool non_blocking, bool copy, *, torch.memory_format memory_format)
 * (torch.dtype dtype, bool non_blocking, bool copy, *, torch.memory_format memory_format)
 * (Tensor tensor, bool non_blocking, bool copy, *, torch.memory_format memory_format)

I used this instead:

for i in range(torch.cuda.device_count()):  # send model to every GPU
    model.to(f"cuda:{i}")

Member Author

fixed it, thanks

Show benchmarks

[Collapsed CI benchmark comment: full new / old (diff) metric tables for PyArrow==8.0.0 and PyArrow==latest across benchmark_array_xd.json, benchmark_getitem_100B.json, benchmark_indices_mapping.json, benchmark_iterating.json, and benchmark_map_filter.json.]
@lhoestq (Member Author) commented Nov 22, 2023

Merging this one, but lmk if you have more comments for subsequent improvements @NielsRogge

@lhoestq lhoestq merged commit bc44d21 into main Nov 22, 2023
10 of 13 checks passed
@lhoestq lhoestq deleted the fix-multi-gpu-map branch November 22, 2023 15:42
Show benchmarks

[Collapsed CI benchmark comment: full new / old (diff) metric tables for PyArrow==8.0.0 and PyArrow==latest across benchmark_array_xd.json, benchmark_getitem_100B.json, benchmark_indices_mapping.json, benchmark_iterating.json, and benchmark_map_filter.json.]

@alex2awesome commented Dec 29, 2023

This is a little hard to follow — where is the documentation currently? I am trying to follow from snippets, here is what I have based on your convo in this thread:

>>> for i in range(torch.cuda.device_count()):  # send model to every GPU
...     model.to(f"cuda:{i}")
>>>
>>> def gpu_computation(example, rank):
...     torch.cuda.set_device(f"cuda:{rank}")  # use one GPU
...     inputs = tokenizer(texts, truncation=True, return_tensors="pt").to(f"cuda:{rank}")
...     outputs = model(**inputs)
...     # ...

but I'm getting device errors (data is on device 3, but it thinks the model is on device 0, despite setting torch.cuda.set_device).

Is this correct? What version of torch are you using for this?

@alex2awesome commented Dec 29, 2023

Anyway, this didn't work for me:

torch.cuda.set_device(f"cuda:{rank}") # use one GPU

but replacing it with this did:

model.to(f"cuda:{rank}")

(.to doesn't make a million copies of the model on the device, which I was worried it would do... so you can use it in an inner process)

(btw, versions: torch==2.1.1, cuda=12.2)
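The working pattern described above can be sketched end to end as follows. This is an illustrative outline, not the official docs example: `model` and `tokenizer` are placeholders, the GPU lines are commented out since they need torch and GPUs to run, and `device_for_rank` is a helper introduced here for illustration.

```python
def device_for_rank(rank, num_gpus):
    """Map a datasets.map worker rank onto a CUDA device string."""
    return f"cuda:{rank % num_gpus}"

def gpu_computation(batch, rank, model=None, tokenizer=None, num_gpus=2):
    device = device_for_rank(rank, num_gpus)
    # Each spawned worker holds its own copy of the model, so .to(device)
    # pins that copy to this worker's GPU (.to moves the model in place;
    # it does not duplicate it):
    # model.to(device)
    # inputs = tokenizer(batch["text"], truncation=True,
    #                    return_tensors="pt").to(device)
    # batch["output"] = model(**inputs)
    return batch
```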

@NielsRogge (Contributor)
Yeah for me this issue isn't resolved yet, we need a better code example

@lhoestq (Member Author) commented Dec 30, 2023

Hi @alex2awesome, could you open a PR with your suggestion to improve this code snippet?

@alex2awesome commented Dec 31, 2023

I'm happy to once I get it fully working, but I feel like there are some fundamentals I'm not fully understanding.

I've set it up twice now, for 2 GPU-processing pipelines.

In one pipeline, my memory usage is fine, it delivers a huge speedup, and everything is great. In the second pipeline, I keep getting OOM errors when num_proc > 1 that I don't get when num_proc=1.

There is a discussion here (pytorch/pytorch#44156) about CUDA memory leaks in multiprocessing setups, and I haven't had time to fully read the source code of datasets.map to understand whether the situations are parallel. Also, if they are, I don't know what the solution is, not really knowing how it is implemented under the hood. In that discussion, one person offers a workaround, but it doesn't look great.

So I haven't fully tested enough to see what the issue is. If I feel comfortable over the next several days producing a slimmed-down example that generalizes to real-world cases such as those I'm working with now, I will contribute it.

@alex2awesome commented Dec 31, 2023

@lhoestq do you know how datasets does multiprocessing? Do we use:
https://pytorch.org/docs/stable/multiprocessing.html#module-torch.multiprocessing?

If so, there are lots of points around memory usage, here:
https://pytorch.org/docs/stable/notes/multiprocessing.html

EDIT: ahh I see it is using python's native multiprocessing library: https://github.com/huggingface/datasets/blob/2.15.0/src/datasets/arrow_dataset.py#L3172-L3189

@alex2awesome commented Dec 31, 2023

After some more research and playing around, I can't pinpoint the source of my CUDA memory leak nor can I determine with confidence what works and what doesn't in this setup.

I'm not really an expert on multiprocessing in general, but my gut feeling is that the current setup isn't ideal for multiprocessing, and I'm not sure I would recommend that users do this.

Kinda unfortunate, because I don't see any great tools for distributed inference out there, and in theory, datasets.map could be the standard.

Are either of you more experienced in this?

@lhoestq (Member Author) commented Jan 2, 2024

Not sure about your GPU's OOM :/

Still, I opened a PR with your suggestion here: #6550

@lhoestq lhoestq mentioned this pull request Jan 2, 2024
@kopyl commented Jan 27, 2024

I still get only rank 0...

Here is my code: https://pastebin.com/c6du8jaM

From this ^ I just import one function:

from test import map_train
from multiprocess import set_start_method


set_start_method("spawn")
map_train()

And here is the traceback:
https://pastebin.com/YijspwQK
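One likely culprit in the snippet above: with the "spawn" start method, each worker re-imports the main module, so any top-level calls to `set_start_method()` / `map_train()` run again in every child. Guarding them behind `if __name__ == "__main__":` is exactly what this PR adds to the docs example. A self-contained sketch (the `map_train` body here is a stand-in for the real one in the pastebin):

```python
try:
    from multiprocess import set_start_method  # the fork of multiprocessing that datasets uses
except ImportError:
    from multiprocessing import set_start_method  # stdlib fallback for this sketch

def map_train():
    # Stand-in for kopyl's real map_train from the pastebin, which calls
    # dataset.map(..., with_rank=True, num_proc=...) on GPU workers.
    return "mapped"

if __name__ == "__main__":
    # Without this guard, every spawned worker re-executes the module's top
    # level: "spawn" makes the child re-import __main__, so unguarded calls
    # to set_start_method()/map_train() would run again in each worker.
    set_start_method("spawn", force=True)
    map_train()
```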

@kopyl
Copy link

kopyl commented Jan 27, 2024

Also, this code from your docs is not valid (source: https://huggingface.co/docs/datasets/main/en/process#multiprocessing):

for i in range(torch.cuda.device_count()):
    model.to(f"cuda:{i}")

For me, this sends the model only to the second GPU:

vae = AutoencoderKL.from_pretrained(
    pretrained_model_name_or_path, subfolder="vae"
)
vae.to("cuda:0")
vae.to("cuda:1")
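For what it's worth, a loop like the one in the docs cannot place one model on several GPUs, because `Module.to()` moves the module in place and returns the same object, so only the last `.to()` call "wins". A stdlib-only sketch of that semantics (`TinyModule` is a hypothetical stub standing in for `torch.nn.Module`, so this runs without torch or GPUs):

```python
class TinyModule:
    """Stub standing in for torch.nn.Module: .to() moves the module in place."""
    def __init__(self):
        self.device = "cpu"

    def to(self, device):
        self.device = device  # in-place move, like the real Module.to()
        return self

model = TinyModule()
for i in range(2):  # the loop from the docs page
    model.to(f"cuda:{i}")
print(model.device)  # cuda:1 -- only the last .to() call takes effect
```

This matches the observation above: with 2 GPUs the model ends up only on `cuda:1`. The working pattern instead moves each worker's own copy of the model to its rank's device inside the mapped function.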

@kopyl
Copy link

kopyl commented Jan 27, 2024

Could you please provide a working example of multi-GPU mapping?

Not just an example in the docs, but a real working example, from all the imports to loading the datasets and models.

@kopyl
Copy link

kopyl commented Jan 27, 2024

@alex2awesome I have the same issue with CUDA OOM. It shouldn't be happening, since the 2 different GPUs should each be handling a different share of the load, but in fact something is going wrong.

@lhoestq
Copy link
Member Author

lhoestq commented Jan 29, 2024

I haven't experimented much with the multi-GPU code documentation.

Can you try using the code example at #6550 instead? It would be super helpful if you could confirm that it works on your side.

Though if you have some fixes/improvements ideas feel free to open a PR !

@kopyl
Copy link

kopyl commented Jan 30, 2024

@lhoestq the mapping does not start at all in this case:
[screenshot]

Here is the updated code: https://pastebin.com/Kn9aGfZr

@kopyl
Copy link

kopyl commented Jan 30, 2024

@lhoestq with this code: https://pastebin.com/muDm78kp
I'm now getting this error:

Map (num_proc=2):   1%|       | 26288/3043663 [06:11<11:51:08, 70.72 examples/s]
Traceback (most recent call last):
  File "/workspace/compute.py", line 229, in <module>
    map_train()
  File "/workspace/compute.py", line 224, in map_train
    return train_dataset.map(compute_embeddings_fn, batched=True, batch_size=16, with_rank=True, num_proc=2, keep_in_memory=True)
  File "/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py", line 593, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py", line 558, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py", line 3193, in map
    for rank, done, content in iflatmap_unordered(
  File "/usr/local/lib/python3.10/dist-packages/datasets/utils/py_utils.py", line 658, in iflatmap_unordered
    raise RuntimeError(
RuntimeError: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.

Also, downloading my dataset worked fine from one machine, but from another I get:

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/datasets/kopyl/3M_icons_monochrome_only_no_captioning/revision/753dca4be462dad7022f34cc273555ab6deb5832 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))"), '(Request ID: 7d0881f3-1b93-4d73-bcb6-52e816d84529)')

Can't download my dataset at all...

@lhoestq
Copy link
Member Author

lhoestq commented Jan 30, 2024

Hmm, this is not good. Do you know a way to make it work?

Basically map creates two subprocesses and runs the function in the subprocesses. Since the function receives a rank parameter, it should be possible to choose which GPU to use.
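That rank-based device choice can be sketched as a tiny helper (the name `rank_to_device` is illustrative, not from the library; it runs without torch or GPUs):

```python
def rank_to_device(rank: int, num_gpus: int) -> str:
    """Map a datasets.map worker rank to a CUDA device string.

    With num_proc workers and num_gpus GPUs, worker `rank` is pinned to
    GPU rank % num_gpus, spreading the workers evenly across devices.
    """
    return f"cuda:{rank % num_gpus}"

# e.g. num_proc=4 workers spread over 2 GPUs:
print([rank_to_device(r, 2) for r in range(4)])
# ['cuda:0', 'cuda:1', 'cuda:0', 'cuda:1']
```

Inside the mapped function this would look roughly like `def gpu_fn(batch, rank): device = rank_to_device(rank, torch.cuda.device_count()); model.to(device); ...`, called via `dataset.map(gpu_fn, with_rank=True, num_proc=...)` — a sketch of the idea behind #6550, not the exact code from the PR.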

@forrestbao
Copy link

I can confirm that PR #6550 works. All GPUs are at full throttle. You have to manually move the model to all GPUs.

> I haven't experimented much with the multi-GPU code documentation.
>
> Can you try using the code example at #6550 instead? It would be super helpful if you could confirm that it works on your side.
>
> Though if you have some fixes/improvements ideas feel free to open a PR!

@forrestbao
Copy link

I wrote a blog post with a complete example, compiled from several PRs and issues, here. Hope it helps. Let me know how it goes.

> Could you please provide a working example of multi-GPU mapping?
>
> Not just an example in the docs, but a real working example, from all the imports to loading the datasets and models.

Development

Successfully merging this pull request may close these issues.

Feature request: add code example of multi-GPU processing