minhvtt committed on
Commit
757838b
·
verified ·
1 Parent(s): f056202

Update qdrant_service.py

Browse files
Files changed (1) hide show
  1. qdrant_service.py +137 -20
qdrant_service.py CHANGED
@@ -106,26 +106,52 @@ class QdrantVectorService:
106
  else:
107
  print("✓ Collection already exists")
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  def index_data(
110
  self,
111
  doc_id: str,
112
  embedding: np.ndarray,
113
  metadata: Dict[str, Any]
114
- ) -> str:
115
  """
116
  Index data vào Qdrant
117
 
118
  Args:
119
- doc_id: ID của document (event/social media post)
120
  embedding: Vector embedding từ Jina CLIP
121
  metadata: Metadata (text, image_url, event_info, etc.)
122
 
123
  Returns:
124
- ID của point đã index
125
  """
126
- # Generate UUID nếu không ID
127
- if not doc_id:
128
- doc_id = str(uuid.uuid4())
 
 
129
 
130
  # Ensure embedding là 1D array
131
  if len(embedding.shape) > 1:
@@ -133,7 +159,7 @@ class QdrantVectorService:
133
 
134
  # Create point
135
  point = PointStruct(
136
- id=doc_id,
137
  vector=embedding.tolist(),
138
  payload=metadata
139
  )
@@ -144,41 +170,53 @@ class QdrantVectorService:
144
  points=[point]
145
  )
146
 
147
- return doc_id
 
 
 
148
 
149
  def batch_index(
150
  self,
151
  doc_ids: List[str],
152
  embeddings: np.ndarray,
153
  metadata_list: List[Dict[str, Any]]
154
- ) -> List[str]:
155
  """
156
  Batch index nhiều documents cùng lúc
157
 
158
  Args:
159
- doc_ids: List of document IDs
160
  embeddings: Numpy array of embeddings (n_samples, embedding_dim)
161
  metadata_list: List of metadata dicts
162
 
163
  Returns:
164
- List of indexed IDs
165
  """
166
  points = []
 
167
 
168
  for i, (doc_id, embedding, metadata) in enumerate(zip(doc_ids, embeddings, metadata_list)):
169
- if not doc_id:
170
- doc_id = str(uuid.uuid4())
 
 
 
171
 
172
  # Ensure embedding là 1D
173
  if len(embedding.shape) > 1:
174
  embedding = embedding.flatten()
175
 
176
  points.append(PointStruct(
177
- id=doc_id,
178
  vector=embedding.tolist(),
179
  payload=metadata
180
  ))
181
 
 
 
 
 
 
182
  # Batch upsert
183
  self.client.upsert(
184
  collection_name=self.collection_name,
@@ -186,7 +224,7 @@ class QdrantVectorService:
186
  wait=True # Wait for indexing to complete
187
  )
188
 
189
- return doc_ids
190
 
191
  def search(
192
  self,
@@ -233,11 +271,15 @@ class QdrantVectorService:
233
  with_vectors=False # Không cần return vectors
234
  )
235
 
236
- # Format results
237
  results = []
238
  for hit in search_result:
 
 
 
239
  results.append({
240
- "id": hit.id,
 
241
  "confidence": float(hit.score), # Cosine similarity score
242
  "metadata": hit.payload
243
  })
@@ -297,20 +339,95 @@ class QdrantVectorService:
297
 
298
  def delete_by_id(self, doc_id: str) -> bool:
299
  """
300
- Delete document by ID
301
 
302
  Args:
303
- doc_id: Document ID to delete
304
 
305
  Returns:
306
  Success status
307
  """
 
 
 
308
  self.client.delete(
309
  collection_name=self.collection_name,
310
- points_selector=[doc_id]
311
  )
312
  return True
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  def get_collection_info(self) -> Dict[str, Any]:
315
  """
316
  Lấy thông tin collection
 
106
  else:
107
  print("✓ Collection already exists")
108
 
109
def _convert_to_valid_id(self, doc_id: str) -> str:
    """
    Convert an arbitrary document ID into a UUID string usable as a Qdrant point ID.

    Args:
        doc_id: Original ID (MongoDB ObjectId, plain string, etc.).
            Non-string values are converted with ``str()`` first.

    Returns:
        A canonical (lowercase, hyphenated) UUID string:
        - a random UUID4 when ``doc_id`` is empty/falsy;
        - the canonical spelling of ``doc_id`` when it already parses as a UUID;
        - otherwise a deterministic UUID5 (DNS namespace) of the ID, so the
          same input always maps to the same UUID.
    """
    if not doc_id:
        return str(uuid.uuid4())

    # Non-str IDs (e.g. an ObjectId instance) would make uuid.UUID() raise
    # AttributeError/TypeError instead of ValueError, so coerce up front.
    raw_id = doc_id if isinstance(doc_id, str) else str(doc_id)

    try:
        # Re-serialise so every accepted spelling (uppercase, no hyphens,
        # braces, urn: prefix) yields the same canonical string.
        return str(uuid.UUID(raw_id))
    except ValueError:
        # Not a UUID: derive one deterministically (same input = same UUID).
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, raw_id))
132
+
133
  def index_data(
134
  self,
135
  doc_id: str,
136
  embedding: np.ndarray,
137
  metadata: Dict[str, Any]
138
+ ) -> Dict[str, str]:
139
  """
140
  Index data vào Qdrant
141
 
142
  Args:
143
+ doc_id: ID của document (MongoDB ObjectId, string, etc.)
144
  embedding: Vector embedding từ Jina CLIP
145
  metadata: Metadata (text, image_url, event_info, etc.)
146
 
147
  Returns:
148
+ Dict với original_id qdrant_id
149
  """
150
+ # Convert ID thành UUID hợp lệ
151
+ qdrant_id = self._convert_to_valid_id(doc_id)
152
+
153
+ # Lưu original ID vào metadata
154
+ metadata['original_id'] = doc_id
155
 
156
  # Ensure embedding là 1D array
157
  if len(embedding.shape) > 1:
 
159
 
160
  # Create point
161
  point = PointStruct(
162
+ id=qdrant_id,
163
  vector=embedding.tolist(),
164
  payload=metadata
165
  )
 
170
  points=[point]
171
  )
172
 
173
+ return {
174
+ "original_id": doc_id,
175
+ "qdrant_id": qdrant_id
176
+ }
177
 
178
  def batch_index(
179
  self,
180
  doc_ids: List[str],
181
  embeddings: np.ndarray,
182
  metadata_list: List[Dict[str, Any]]
183
+ ) -> List[Dict[str, str]]:
184
  """
185
  Batch index nhiều documents cùng lúc
186
 
187
  Args:
188
+ doc_ids: List of document IDs (MongoDB ObjectId, string, etc.)
189
  embeddings: Numpy array of embeddings (n_samples, embedding_dim)
190
  metadata_list: List of metadata dicts
191
 
192
  Returns:
193
+ List of dicts với original_id và qdrant_id
194
  """
195
  points = []
196
+ id_mappings = []
197
 
198
  for i, (doc_id, embedding, metadata) in enumerate(zip(doc_ids, embeddings, metadata_list)):
199
+ # Convert to valid UUID
200
+ qdrant_id = self._convert_to_valid_id(doc_id)
201
+
202
+ # Lưu original ID vào metadata
203
+ metadata['original_id'] = doc_id
204
 
205
  # Ensure embedding là 1D
206
  if len(embedding.shape) > 1:
207
  embedding = embedding.flatten()
208
 
209
  points.append(PointStruct(
210
+ id=qdrant_id,
211
  vector=embedding.tolist(),
212
  payload=metadata
213
  ))
214
 
215
+ id_mappings.append({
216
+ "original_id": doc_id,
217
+ "qdrant_id": qdrant_id
218
+ })
219
+
220
  # Batch upsert
221
  self.client.upsert(
222
  collection_name=self.collection_name,
 
224
  wait=True # Wait for indexing to complete
225
  )
226
 
227
+ return id_mappings
228
 
229
  def search(
230
  self,
 
271
  with_vectors=False # Không cần return vectors
272
  )
273
 
274
+ # Format results - trả về original_id thay vì UUID
275
  results = []
276
  for hit in search_result:
277
+ # Lấy original_id từ metadata (MongoDB ObjectId)
278
+ original_id = hit.payload.get('original_id', hit.id)
279
+
280
  results.append({
281
+ "id": original_id, # Trả về MongoDB ObjectId
282
+ "qdrant_id": hit.id, # UUID trong Qdrant
283
  "confidence": float(hit.score), # Cosine similarity score
284
  "metadata": hit.payload
285
  })
 
339
 
340
def delete_by_id(self, doc_id: str) -> bool:
    """
    Delete a single document by ID (accepts a MongoDB ObjectId or a UUID).

    The given ID is passed through ``_convert_to_valid_id`` before the
    delete request is sent.

    Args:
        doc_id: Document ID to delete (MongoDB ObjectId or UUID).

    Returns:
        Always True; there is no error handling here, so client errors
        propagate as exceptions.
    """
    point_ids = [self._convert_to_valid_id(doc_id)]
    self.client.delete(
        collection_name=self.collection_name,
        points_selector=point_ids,
    )
    return True
358
 
359
def get_by_id(self, doc_id: str) -> Optional[Dict[str, Any]]:
    """
    Fetch one document by ID (accepts a MongoDB ObjectId or a UUID).

    Args:
        doc_id: Document ID (MongoDB ObjectId or UUID).

    Returns:
        A dict with ``id`` (the stored ``original_id``, falling back to the
        Qdrant point ID), ``qdrant_id`` and ``metadata`` (the payload, or an
        empty dict when the point has none). None when no point matches or
        the lookup raises.
    """
    qdrant_id = self._convert_to_valid_id(doc_id)

    # Only the remote call is best-effort; result formatting stays outside
    # the try so a formatting bug is not misreported as "not found".
    try:
        result = self.client.retrieve(
            collection_name=self.collection_name,
            ids=[qdrant_id],
            with_payload=True,
            with_vectors=False,
        )
    except Exception as e:
        print(f"Error retrieving document: {e}")
        return None

    if not result:
        return None

    point = result[0]
    # Guard against a missing payload (None) so an existing point without
    # payload is still returned instead of failing on ``.get``.
    payload = point.payload or {}
    return {
        "id": payload.get('original_id', point.id),  # MongoDB ObjectId
        "qdrant_id": point.id,  # UUID inside Qdrant
        "metadata": payload,
    }
392
+
393
def search_by_metadata(
    self,
    filter_conditions: Dict,
    limit: int = 100
) -> List[Dict[str, Any]]:
    """
    List documents matching metadata conditions (no embedding required).

    Args:
        filter_conditions: Qdrant filter conditions, forwarded unchanged as
            ``scroll_filter``.
        limit: Maximum number of results.

    Returns:
        One dict per matching point with ``id`` (the stored ``original_id``,
        falling back to the Qdrant point ID), ``qdrant_id`` and ``metadata``
        (the payload, or an empty dict when the point has none). An empty
        list when the scroll call raises.
    """
    # Only the remote call is best-effort; result formatting stays outside
    # the try so one odd point cannot silently empty the whole result.
    try:
        # scroll returns a tuple (points, next_page_offset); only the first
        # page is used here.
        points, _next_offset = self.client.scroll(
            collection_name=self.collection_name,
            scroll_filter=filter_conditions,
            limit=limit,
            with_payload=True,
            with_vectors=False,
        )
    except Exception as e:
        print(f"Error searching by metadata: {e}")
        return []

    documents = []
    for point in points:
        # Guard against a missing payload (None) instead of failing on ``.get``.
        payload = point.payload or {}
        documents.append({
            "id": payload.get('original_id', point.id),  # MongoDB ObjectId
            "qdrant_id": point.id,  # UUID inside Qdrant
            "metadata": payload,
        })
    return documents
430
+
431
  def get_collection_info(self) -> Dict[str, Any]:
432
  """
433
  Lấy thông tin collection