
64 add scripts for generating embeddings · stacklok/codegate@3b109e5

Commit 3b109e5

add scripts for generating embeddings

1 parent e48b1c5 commit 3b109e5

File tree

data/archived.jsonl
data/deprecated.jsonl
data/malicious.jsonl
requirements.txt
scripts/import_packages.py
utils/__init__.py
utils/embedding_util.py

7 files changed: 66,480 lines added, 0 removed

Diff for: data/archived.jsonl (+9,309 lines; large diff not rendered by default)

Diff for: data/deprecated.jsonl (+31,572 lines; large diff not rendered by default)

Diff for: data/malicious.jsonl (+25,480 lines; large diff not rendered by default)
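
The data files themselves are not rendered above. Judging from how scripts/import_packages.py (further down) consumes them, each line appears to hold one JSON object with at least name and description fields; the schema's type property suggests an ecosystem field as well. A minimal sketch of reading one such record, assuming that shape and with made-up values:

import json

# Hypothetical example of a single line from data/malicious.jsonl; the values are invented.
line = '{"name": "example-pkg", "type": "npm", "description": "Example package description"}'
package = json.loads(line)
print(package["name"], "-", package["description"])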

Diff for: requirements.txt (+4)

weaviate==0.1.2
weaviate-client==4.9.3
torch==2.5.1
transformers==4.46.3
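
These pins cover the Weaviate Python client plus the PyTorch/Transformers stack used by utils/embedding_util.py below. Installing them with pip install -r requirements.txt appears to be the only environment setup the import script needs, since it starts its own embedded Weaviate instance rather than connecting to a separate server.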

Diff for: scripts/import_packages.py (+75)

import json
from utils.embedding_util import generate_embeddings
import weaviate
from weaviate.embedded import EmbeddedOptions
from weaviate.classes.config import Property, DataType


json_files = [
    'data/archived.jsonl',
    'data/deprecated.jsonl',
    'data/malicious.jsonl',
]


def setup_schema(client):
    # Recreate the "Package" collection from scratch on every run
    if client.collections.exists("Package"):
        client.collections.delete("Package")
    client.collections.create(
        "Package",
        properties=[
            Property(name="name", data_type=DataType.TEXT),
            Property(name="type", data_type=DataType.TEXT),
            Property(name="status", data_type=DataType.TEXT),
            Property(name="description", data_type=DataType.TEXT),
        ]
    )


def add_data(client):
    collection = client.collections.get("Package")

    for json_file in json_files:
        with open(json_file, 'r') as f:
            print("Adding data from", json_file)
            counter = 0
            with collection.batch.dynamic() as batch:
                for line in f:
                    package = json.loads(line)
                    counter += 1
                    # only import the first 100 packages from each file
                    if counter > 100:
                        break

                    # prepare the object for embedding
                    vector_str = f"{package['name']} {package['description']}"
                    vector = generate_embeddings(vector_str)

                    # now add the status column
                    if 'archived' in json_file:
                        package['status'] = 'archived'
                    elif 'deprecated' in json_file:
                        package['status'] = 'deprecated'
                    elif 'malicious' in json_file:
                        package['status'] = 'malicious'
                    else:
                        package['status'] = 'unknown'

                    batch.add_object(properties=package, vector=vector)


def run_import():
    # Run an embedded Weaviate instance that persists to ./weaviate_data
    client = weaviate.WeaviateClient(
        embedded_options=EmbeddedOptions(
            persistence_data_path="./weaviate_data"
        ),
    )
    with client:
        client.connect()
        print('is_ready:', client.is_ready())

        setup_schema(client)
        add_data(client)


if __name__ == '__main__':
    run_import()
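
Not part of the commit, but for illustration: once run_import() has populated the embedded instance, the collection could be searched with a vector produced by the same embedding function. A minimal sketch, assuming the weaviate-client 4.x near_vector query API and an invented query string:

from utils.embedding_util import generate_embeddings
import weaviate
from weaviate.embedded import EmbeddedOptions

# Reuse the embedded instance the import script persisted to ./weaviate_data
client = weaviate.WeaviateClient(
    embedded_options=EmbeddedOptions(
        persistence_data_path="./weaviate_data"
    ),
)
with client:
    client.connect()
    packages = client.collections.get("Package")
    # Embed the query text with the same model used at import time
    results = packages.query.near_vector(
        near_vector=generate_embeddings("deprecated HTTP client"),
        limit=3,
    )
    for obj in results.objects:
        print(obj.properties["name"], obj.properties["status"])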

Diff for: utils/__init__.py (whitespace-only changes)

Diff for: utils/embedding_util.py (+40)

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch import Tensor
import os
import warnings

# The transformers library creates this warning internally; it does not
# impact our app and is safe to ignore.
warnings.filterwarnings(action='ignore', category=ResourceWarning)

# We won't have competing threads in this example app
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize tokenizer and model for GTE-base
tokenizer = AutoTokenizer.from_pretrained('thenlper/gte-base')
model = AutoModel.from_pretrained('thenlper/gte-base')


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    # Zero out padding positions, then average the remaining token embeddings
    last_hidden = last_hidden_states.masked_fill(
        ~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def generate_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt',
                       max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    attention_mask = inputs['attention_mask']
    embeddings = average_pool(outputs.last_hidden_state, attention_mask)

    # (Optionally) normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

    return embeddings.numpy().tolist()[0]
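
Not part of the commit: a minimal sketch of calling generate_embeddings directly. Because the vectors are L2-normalized before being returned, the dot product of two of them equals their cosine similarity; the example strings are invented.

from utils.embedding_util import generate_embeddings

a = generate_embeddings("requests: an HTTP library for Python")
b = generate_embeddings("a deprecated HTTP client package")

# Vectors are already unit-length, so the dot product is the cosine similarity
similarity = sum(x * y for x, y in zip(a, b))
print(f"cosine similarity: {similarity:.3f}")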
