Content-Length: 7353 | pFad | http://github.com/huggingface/datasets/pull/7411.patch

thub.com From cce50b8fb7c640998c7c81fc350a754f0ebfbd52 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 17 Feb 2025 13:44:34 -0800 Subject: [PATCH 1/5] debug --- src/datasets/arrow_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index ca5106a5b9d..cd4117fb76a 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -3175,11 +3175,16 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str: shards_done += 1 logger.debug(f"Finished processing shard number {rank} of {num_shards}.") transformed_shards[rank] = content + logger.debug(f"Shard set {rank}") else: pbar.update(content) + logger.debug("All shards processed") + logger.debug("End pool") + logger.debug("Out of pool") # Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805) for kwargs in kwargs_per_job: del kwargs["shard"] + logger.debug("deld") else: logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}") assert None not in transformed_shards, ( From d68bee84792525411730d44505ebbc8dc6487058 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 17 Feb 2025 14:45:14 -0800 Subject: [PATCH 2/5] close it --- src/datasets/arrow_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index cd4117fb76a..c5beff645e6 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -3180,6 +3180,8 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str: pbar.update(content) logger.debug("All shards processed") logger.debug("End pool") + pool.close() + pool.join() logger.debug("Out of pool") # Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805) for kwargs in kwargs_per_job: From c7ba3e4901b976a3f7513e8c9e1770c338906b35 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 17 Feb 2025 14:45:47 -0800 Subject: [PATCH 3/5] logs --- src/datasets/arrow_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c5beff645e6..ebd1741799a 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -3181,7 +3181,9 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str: logger.debug("All shards processed") logger.debug("End pool") pool.close() + logger.debug('closed') pool.join() + logger.debug('joined') logger.debug("Out of pool") # Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805) for kwargs in kwargs_per_job: From f0277546e903f367eb62484f7439251dd2700493 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 17 Feb 2025 15:30:24 -0800 Subject: [PATCH 4/5] remove logs --- src/datasets/arrow_dataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index ebd1741799a..e1555775117 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -3175,20 +3175,20 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str: shards_done += 1 logger.debug(f"Finished processing shard number {rank} of {num_shards}.") transformed_shards[rank] = content - logger.debug(f"Shard set {rank}") + # logger.debug(f"Shard set {rank}") else: pbar.update(content) - logger.debug("All shards processed") - logger.debug("End pool") + # logger.debug("All shards processed") + # logger.debug("End pool") pool.close() - logger.debug('closed') + # logger.debug('closed') pool.join() - logger.debug('joined') - logger.debug("Out of pool") + # logger.debug('joined') + # logger.debug("Out of pool") # Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805) for kwargs in kwargs_per_job: del kwargs["shard"] - logger.debug("deld") + # logger.debug("deld") else: logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}") assert None not in transformed_shards, ( From 127cf369c61f2b4900a2aaec536818513c791d2f Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 17 Feb 2025 15:36:04 -0800 Subject: [PATCH 5/5] remove comments --- src/datasets/arrow_dataset.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index e1555775117..3b9231fd139 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -3175,20 +3175,13 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str: shards_done += 1 logger.debug(f"Finished processing shard number {rank} of {num_shards}.") transformed_shards[rank] = content - # logger.debug(f"Shard set {rank}") else: pbar.update(content) - # logger.debug("All shards processed") - # logger.debug("End pool") pool.close() - # logger.debug('closed') pool.join() - # logger.debug('joined') - # logger.debug("Out of pool") # Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805) for kwargs in kwargs_per_job: del kwargs["shard"] - # logger.debug("deld") else: logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}") assert None not in transformed_shards, (








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/huggingface/datasets/pull/7411.patch

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy