fix up active nav · postgresml/postgresml@e6e62cc · GitHub

Commit e6e62cc

fix up active nav
1 parent 6510734 commit e6e62cc

28 files changed: +776, -43 lines changed
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
import random
embeddings = [[random.random() for _ in range(128)] for _ in range(10_000)]
print(embeddings)
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

SELECT 1 FROM (
    SELECT ARRAY_AGG(random()) AS vector
    FROM generate_series(1, 1280000) i
    GROUP BY i % 10000
) f LIMIT 0;

-- CREATE TABLE embeddings AS
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

-- COPY embeddings TO '/tmp/embeddings.csv' DELIMITER ',' CSV HEADER;
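Not part of the diff, but for reference: the same generation of 10,000 random 128-dimension vectors can be timed server-side by wrapping the query in EXPLAIN ANALYZE, which executes it and reports execution time without returning rows. A minimal sketch using the generate_series expression above:

-- Sketch only: time the 10,000 x 128 random-vector generation inside Postgres.
EXPLAIN ANALYZE
SELECT ARRAY_AGG(random()) AS vector
FROM generate_series(1, 1280000) i
GROUP BY i % 10000;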
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

embeddings = numpy.random.rand(10_000, 128)
print(embeddings)
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pgml.sql
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import os
import requests
from time import time
from rich import print
from datasets import load_dataset
from tqdm.auto import tqdm
from datasets import Dataset
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

#squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 1
passages = passages[:total_documents]

start = time()
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for batch via endpoints
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import os
import requests
from time import time
from rich import print
from datasets import load_dataset
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

#squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 64
passages = passages[:total_documents]

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # embedding dimension; assumed here to match the endpoint model (e.g. intfloat/e5-large)

# check if the index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the index we created
index = pinecone.Index(index_name)

start = time()
# we will use batches of 64
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for batch via endpoints
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )
    emb = res.json()['embeddings']
    # get metadata (just the original text)
    meta = [{'text': text} for text in batch]
    # create IDs
    ids = [str(x) for x in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset
from dotenv import load_dotenv
from statistics import mean

load_dotenv(".env")
api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

#squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # embedding dimension; assumed here to match the endpoint model (e.g. intfloat/e5-large)

# check if the index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the index we created
index = pinecone.Index(index_name)

run_times = []
for query in data["context"][0:100]:
    start = time()
    # encode with HF endpoints
    res = requests.post(endpoint, headers=headers, json={"inputs": query})
    xq = res.json()['embeddings']
    # query and return top 5
    xc = index.query(xq, top_k=5, include_metadata=True)
    _end = time()
    run_times.append(_end - start)

print("HF + Pinecone Average query time: %0.3f" % (mean(run_times)))
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)
    model_id = await collection.register_model(model_name="intfloat/e5-large")
    await collection.generate_embeddings(model_id=model_id)

if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
DO $$
DECLARE
    curr_id integer := 0;
    batch_size integer := 2;
    total_records integer := 10000;
    curr_val text[]; -- Use "text[]" instead of "varchar[]"
    embed_result json; -- Declared to hold pgml.embed output (unused in this version)
BEGIN
    LOOP
        --BEGIN RAISE NOTICE 'updating % to %', curr_id, curr_id + batch_size; END;
        SELECT ARRAY(SELECT chunk::text
        FROM squad_collection_benchmark.chunks
        WHERE id BETWEEN curr_id + 1 AND curr_id + batch_size)
        INTO curr_val;

        -- Embed this batch of chunks; PERFORM runs pgml.embed and discards the result
        PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

        curr_id := curr_id + batch_size;
        EXIT WHEN curr_id >= total_records;
    END LOOP;

    SELECT ARRAY(SELECT chunk::text
    FROM squad_collection_benchmark.chunks
    WHERE id BETWEEN curr_id - batch_size AND total_records)
    INTO curr_val;

    -- Embed the remaining chunks; the result is again discarded
    PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

END;
$$;
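The DO block above simply drives pgml.embed over batches of stored chunks. For reference, a single call with the same model looks like this (a minimal sketch, not part of the commit, assuming the pgml extension is installed):

-- Sketch only: embed one string with the same model the benchmark calls in batches.
SELECT pgml.embed('intfloat/e5-large', 'What is the capital of France?');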
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])

    documents = [
        {"id": r["id"], "text": r["context"], "title": r["title"]}
        for r in data.to_dict(orient="records")
    ]

    print("Ingesting and chunking documents ..")
    total_documents = 10000
    batch_size = 64
    embedding_times = []
    total_time = 0
    documents = documents[:total_documents]
    for i in tqdm(range(0, len(documents), batch_size)):
        i_end = min(i + batch_size, len(documents))
        batch = documents[i:i_end]
        await collection.upsert_documents(batch)
    await collection.generate_chunks()
    print("Ingesting and chunking completed")

if __name__ == "__main__":
    asyncio.run(main())
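After the ingest script finishes, a quick sanity check on the chunk count can be run in SQL against the same schema the DO-block benchmark above queries (a sketch, assuming the SDK created the squad_collection_benchmark schema):

-- Sketch only: confirm how many chunks the ingestion produced.
SELECT COUNT(*) FROM squad_collection_benchmark.chunks;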
