 """Upload a batch of inferences to the Openlayer platform."""
 
-import os
 import time
-import shutil
-import tarfile
 import tempfile
 from typing import Optional
 
 import httpx
 import pandas as pd
+import pyarrow as pa
 
 from . import StorageType, _upload
-from .. import utils
 from ... import Openlayer
 from ..._utils import maybe_transform
 from ...types.inference_pipelines import data_stream_params
+import asyncio
 
 
-def upload_batch_inferences(
+async def upload_batch_inferences_async(
     client: Openlayer,
     inference_pipeline_id: str,
     config: data_stream_params.Config,
     dataset_df: Optional[pd.DataFrame] = None,
     dataset_path: Optional[str] = None,
     storage_type: Optional[StorageType] = None,
     merge: bool = False,
+    verbose: bool = False,
 ) -> None:
     """Uploads a batch of inferences to the Openlayer platform."""
     if dataset_df is None and dataset_path is None:
@@ -33,7 +32,7 @@ def upload_batch_inferences(
         raise ValueError("Only one of dataset_df or dataset_path should be provided.")
 
     uploader = _upload.Uploader(client, storage_type)
-    object_name = f"batch_data_{time.time()}_{inference_pipeline_id}.tar.gz"
+    object_name = f"batch_data_{time.time()}_{inference_pipeline_id}.arrow"
 
     # Fetch presigned url
     presigned_url_response = client.storage.presigned_url.create(
@@ -42,26 +41,34 @@ def upload_batch_inferences(
 
     # Write dataset and config to temp directory
     with tempfile.TemporaryDirectory() as tmp_dir:
-        temp_file_path = f"{tmp_dir}/dataset.csv"
+        # If a DataFrame is provided, convert it to an Arrow Table and write it
+        # using the IPC stream writer
         if dataset_df is not None:
-            dataset_df.to_csv(temp_file_path, index=False)
-        else:
-            shutil.copy(dataset_path, temp_file_path)
+            temp_file_path = f"{tmp_dir}/dataset.arrow"
+            if verbose:
+                print("Converting DataFrame to pyarrow Table...")
+            pa_table = pa.Table.from_pandas(dataset_df)
+            pa_schema = pa_table.schema
 
-        # Copy relevant files to tmp dir
-        config["label"] = "production"
-        utils.write_yaml(
-            maybe_transform(config, data_stream_params.Config),
-            f"{tmp_dir}/dataset_config.yaml",
-        )
+            if verbose:
+                print(
+                    "Writing Arrow Table using RecordBatchStreamWriter to "
+                    f"{temp_file_path}"
+                )
+            with pa.ipc.RecordBatchStreamWriter(temp_file_path, pa_schema) as writer:
+                writer.write_table(pa_table, max_chunksize=16384)
+        else:
+            object_name = f"batch_data_{time.time()}_{inference_pipeline_id}.csv"
+            temp_file_path = dataset_path
 
-        tar_file_path = os.path.join(tmp_dir, object_name)
-        with tarfile.open(tar_file_path, mode="w:gz") as tar:
-            tar.add(tmp_dir, arcname=os.path.basename("monitoring_data"))
+        # camelCase the config
+        config = maybe_transform(config, data_stream_params.Config)
 
-        # Upload to storage
+        # Upload the dataset file to storage via the presigned URL
+        if verbose:
+            print("Uploading dataset to storage via presigned URL...")
         uploader.upload(
-            file_path=tar_file_path,
+            file_path=temp_file_path,
             object_name=object_name,
             presigned_url_response=presigned_url_response,
         )
@@ -73,10 +80,35 @@ def upload_batch_inferences(
         body={
             "storageUri": presigned_url_response.storage_uri,
             "performDataMerge": merge,
+            "config": config,
         },
     )
 
 
+def upload_batch_inferences(
+    client: Openlayer,
+    inference_pipeline_id: str,
+    config: data_stream_params.Config,
+    dataset_df: Optional[pd.DataFrame] = None,
+    dataset_path: Optional[str] = None,
+    storage_type: Optional[StorageType] = None,
+    merge: bool = False,
+    verbose: bool = False,
+) -> None:
+    asyncio.run(
+        upload_batch_inferences_async(
+            client,
+            inference_pipeline_id,
+            config,
+            dataset_df,
+            dataset_path,
+            storage_type,
+            merge,
+            verbose,
+        )
+    )
+
+
 def update_batch_inferences(
     client: Openlayer,
     inference_pipeline_id: str,
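For context, a minimal sketch of how the new synchronous wrapper might be called from user code; the import path, pipeline id, and config fields below are illustrative assumptions rather than part of this change:

import pandas as pd

from openlayer import Openlayer
from openlayer.lib.data import batch_inferences  # assumed module path

client = Openlayer()  # assumes OPENLAYER_API_KEY is set in the environment

df = pd.DataFrame({"user_query": ["What is 1 + 1?"], "output": ["2"]})

batch_inferences.upload_batch_inferences(
    client=client,
    inference_pipeline_id="YOUR_INFERENCE_PIPELINE_ID",  # placeholder id
    dataset_df=df,
    config={"output_column_name": "output"},  # fields depend on your task type
    verbose=True,
)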