Content-Length: 7353 | pFad | http://github.com/huggingface/datasets/pull/7411.patch
thub.com
From cce50b8fb7c640998c7c81fc350a754f0ebfbd52 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Mon, 17 Feb 2025 13:44:34 -0800
Subject: [PATCH 1/5] debug
---
src/datasets/arrow_dataset.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index ca5106a5b9d..cd4117fb76a 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -3175,11 +3175,16 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
shards_done += 1
logger.debug(f"Finished processing shard number {rank} of {num_shards}.")
transformed_shards[rank] = content
+ logger.debug(f"Shard set {rank}")
else:
pbar.update(content)
+ logger.debug("All shards processed")
+ logger.debug("End pool")
+ logger.debug("Out of pool")
# Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805)
for kwargs in kwargs_per_job:
del kwargs["shard"]
+ logger.debug("deld")
else:
logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
assert None not in transformed_shards, (
From d68bee84792525411730d44505ebbc8dc6487058 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Mon, 17 Feb 2025 14:45:14 -0800
Subject: [PATCH 2/5] close it
---
src/datasets/arrow_dataset.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index cd4117fb76a..c5beff645e6 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -3180,6 +3180,8 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
pbar.update(content)
logger.debug("All shards processed")
logger.debug("End pool")
+ pool.close()
+ pool.join()
logger.debug("Out of pool")
# Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805)
for kwargs in kwargs_per_job:
From c7ba3e4901b976a3f7513e8c9e1770c338906b35 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Mon, 17 Feb 2025 14:45:47 -0800
Subject: [PATCH 3/5] logs
---
src/datasets/arrow_dataset.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index c5beff645e6..ebd1741799a 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -3181,7 +3181,9 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
logger.debug("All shards processed")
logger.debug("End pool")
pool.close()
+ logger.debug('closed')
pool.join()
+ logger.debug('joined')
logger.debug("Out of pool")
# Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805)
for kwargs in kwargs_per_job:
From f0277546e903f367eb62484f7439251dd2700493 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Mon, 17 Feb 2025 15:30:24 -0800
Subject: [PATCH 4/5] remove logs
---
src/datasets/arrow_dataset.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index ebd1741799a..e1555775117 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -3175,20 +3175,20 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
shards_done += 1
logger.debug(f"Finished processing shard number {rank} of {num_shards}.")
transformed_shards[rank] = content
- logger.debug(f"Shard set {rank}")
+ # logger.debug(f"Shard set {rank}")
else:
pbar.update(content)
- logger.debug("All shards processed")
- logger.debug("End pool")
+ # logger.debug("All shards processed")
+ # logger.debug("End pool")
pool.close()
- logger.debug('closed')
+ # logger.debug('closed')
pool.join()
- logger.debug('joined')
- logger.debug("Out of pool")
+ # logger.debug('joined')
+ # logger.debug("Out of pool")
# Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805)
for kwargs in kwargs_per_job:
del kwargs["shard"]
- logger.debug("deld")
+ # logger.debug("deld")
else:
logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
assert None not in transformed_shards, (
From 127cf369c61f2b4900a2aaec536818513c791d2f Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Mon, 17 Feb 2025 15:36:04 -0800
Subject: [PATCH 5/5] remove comments
---
src/datasets/arrow_dataset.py | 7 -------
1 file changed, 7 deletions(-)
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index e1555775117..3b9231fd139 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -3175,20 +3175,13 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
shards_done += 1
logger.debug(f"Finished processing shard number {rank} of {num_shards}.")
transformed_shards[rank] = content
- # logger.debug(f"Shard set {rank}")
else:
pbar.update(content)
- # logger.debug("All shards processed")
- # logger.debug("End pool")
pool.close()
- # logger.debug('closed')
pool.join()
- # logger.debug('joined')
- # logger.debug("Out of pool")
# Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805)
for kwargs in kwargs_per_job:
del kwargs["shard"]
- # logger.debug("deld")
else:
logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
assert None not in transformed_shards, (
--- a PPN by Garber Painting Akron. With Image Size Reduction included!
Fetched URL: http://github.com/huggingface/datasets/pull/7411.patch
Alternative Proxies:
Alternative Proxy
pFad Proxy
pFad v3 Proxy
pFad v4 Proxy