|
14 | 14 |
|
15 | 15 | from feast import FeatureStore |
16 | 16 | from feast.entity import Entity |
17 | | -from feast.errors import FeatureNameCollisionError |
| 17 | +from feast.errors import FeatureNameCollisionError, FeatureViewNotFoundException |
18 | 18 | from feast.feature_service import FeatureService |
19 | 19 | from feast.feature_view import FeatureView |
20 | 20 | from feast.field import Field |
| 21 | +from feast.filter_models import ComparisonFilter, CompoundFilter |
21 | 22 | from feast.infra.offline_stores.file_source import FileSource |
22 | 23 | from feast.infra.utils.postgres.postgres_config import ConnectionType |
23 | 24 | from feast.online_response import TIMESTAMP_POSTFIX |
| 25 | +from feast.repo_config import EmbeddingModelConfig |
24 | 26 | from feast.types import ( |
25 | 27 | Array, |
26 | 28 | Float32, |
@@ -1265,3 +1267,313 @@ def test_retrieve_online_documents_v2(environment, fake_document_data): |
1265 | 1267 | assert len(no_match_results["text_field"]) == 0 |
1266 | 1268 | assert "text_rank" in no_match_results |
1267 | 1269 | assert len(no_match_results["text_rank"]) == 0 |
| 1270 | + |
| 1271 | + |
def _setup_documents_with_categories(fs):
    """Create, register, and populate the ``item_embeddings`` feature view.

    Builds a 20-row frame holding a 2-dimensional ``embedding``, a
    ``text_field``, a ``category`` cycling through ``Category-0`` ..
    ``Category-4`` (four rows per category), an ``item_id`` and an
    ``event_timestamp``. The feature view and its entity are applied to
    ``fs`` and the frame is written to the online store.

    Args:
        fs: Feature store the objects are applied to and the rows written to.

    Returns:
        Tuple ``(feature_view, entity, dataframe)``.
    """
    n_rows = 20
    vector_dim = 2
    # Kept so that anything drawing from the stdlib ``random`` module after
    # this helper observes the same state as before.
    random.seed(42)
    # ``random.seed`` has no effect on NumPy. The embeddings are drawn from a
    # dedicated seeded generator so the stored vectors are reproducible
    # without touching NumPy's global random state.
    rng = np.random.default_rng(42)

    df = pd.DataFrame(
        {
            "item_id": list(range(n_rows)),
            "embedding": [list(rng.random(vector_dim)) for _ in range(n_rows)],
            "text_field": [
                f"Document text content {i} with searchable keywords"
                for i in range(n_rows)
            ],
            # i % 5 spreads the rows evenly over five categories.
            "category": [f"Category-{i % 5}" for i in range(n_rows)],
            "event_timestamp": [datetime.now() for _ in range(n_rows)],
        }
    )

    # The path is a placeholder; rows are written straight to the online
    # store below rather than read from this source.
    data_source = FileSource(
        path="dummy_path.parquet", timestamp_field="event_timestamp"
    )

    item_entity = Entity(
        name="item_id",
        join_keys=["item_id"],
        value_type=ValueType.INT64,
    )

    item_embeddings_fv = FeatureView(
        name="item_embeddings",
        entities=[item_entity],
        schema=[
            Field(name="embedding", dtype=Array(Float32), vector_index=True),
            Field(name="text_field", dtype=String),
            Field(name="category", dtype=String),
            Field(name="item_id", dtype=Int64),
        ],
        source=data_source,
    )

    fs.apply([item_embeddings_fv, item_entity])
    fs.write_to_online_store("item_embeddings", df)
    return item_embeddings_fv, item_entity, df
| 1317 | + |
| 1318 | + |
@pytest.mark.integration
@pytest.mark.universal_online_stores(only=["pgvector", "elasticsearch"])
def test_retrieve_online_documents_v2_with_filters(environment, fake_document_data):
    """Check that metadata filters restrict vector and text search results."""
    store = environment.feature_store
    store.config.online_store.vector_enabled = True

    _setup_documents_with_categories(store)
    query_vector = list(np.random.random(2))

    requested_features = (
        "item_embeddings:embedding",
        "item_embeddings:text_field",
        "item_embeddings:category",
        "item_embeddings:item_id",
    )

    def vector_search(metadata_filter):
        """Run the L2 top-10 vector query with the given filter; return a dict."""
        response = store.retrieve_online_documents_v2(
            # A fresh list per call, as each original call passed its own list.
            features=list(requested_features),
            query=query_vector,
            top_k=10,
            distance_metric="L2",
            filters=metadata_filter,
        )
        return response.to_dict()

    # eq: only Category-0 rows may come back (20 rows / 5 categories = 4).
    eq_hits = vector_search(
        ComparisonFilter(type="eq", key="category", value="Category-0")
    )
    assert len(eq_hits["category"]) > 0
    assert len(eq_hits["category"]) <= 4
    for category in eq_hits["category"]:
        assert category == "Category-0"

    # ne: Category-0 rows must be excluded.
    ne_hits = vector_search(
        ComparisonFilter(type="ne", key="category", value="Category-0")
    )
    assert len(ne_hits["category"]) > 0
    for category in ne_hits["category"]:
        assert category != "Category-0"

    # in: only Category-0 or Category-1 rows may come back.
    in_hits = vector_search(
        ComparisonFilter(
            type="in", key="category", value=["Category-0", "Category-1"]
        )
    )
    assert len(in_hits["category"]) > 0
    for category in in_hits["category"]:
        assert category in ("Category-0", "Category-1")

    # compound AND: category == Category-0 AND item_id >= 5.
    and_hits = vector_search(
        CompoundFilter(
            type="and",
            filters=[
                ComparisonFilter(type="eq", key="category", value="Category-0"),
                ComparisonFilter(type="gte", key="item_id", value=5),
            ],
        )
    )
    assert len(and_hits["category"]) > 0
    for category in and_hits["category"]:
        assert category == "Category-0"
    for item_id in and_hits["item_id"]:
        assert item_id >= 5

    # Text search combined with a filter (no query vector, no distance metric).
    text_response = store.retrieve_online_documents_v2(
        features=list(requested_features),
        query_string="searchable keywords",
        top_k=10,
        filters=ComparisonFilter(type="eq", key="category", value="Category-2"),
    )
    text_hits = text_response.to_dict()
    assert len(text_hits["category"]) > 0
    for category in text_hits["category"]:
        assert category == "Category-2"

    # A filter matching nothing yields no rows.
    no_hits = vector_search(
        ComparisonFilter(type="eq", key="category", value="NonexistentCategory")
    )
    assert len(no_hits.get("category", [])) == 0
| 1447 | + |
| 1448 | + |
@pytest.mark.integration
@pytest.mark.universal_online_stores(only=["pgvector", "elasticsearch"])
def test_retrieve_online_documents_openai(environment, fake_document_data):
    """Test OpenAI-compatible vector store search returns the correct response shape.

    Covers three cases: a plain string query, a query restricted with
    ``features_to_retrieve``, and a list-of-strings query.

    Only ``sys.modules["litellm"]`` is stubbed. This relies on the method
    under test importing ``litellm`` at call time (NOTE(review): confirm
    against ``FeatureStore.retrieve_online_documents_openai``). Wrapping the
    method itself in a ``MagicMock(wraps=...)`` would only forward to the
    real implementation, so no such layer is used.
    """
    fs = environment.feature_store
    fs.config.online_store.vector_enabled = True

    _setup_documents_with_categories(fs)
    vector_dim = 2

    fs.config.embedding_model = EmbeddingModelConfig(model="text-embedding-3-small")

    # Canned embedding response handed back by the stubbed ``litellm.embedding``.
    fake_embedding = list(np.random.random(vector_dim))
    mock_embed_response = unittest.mock.MagicMock()
    mock_embed_response.data = [{"embedding": fake_embedding}]

    def stub_litellm():
        """Return a context manager replacing ``litellm`` with a stub module."""
        fake_module = unittest.mock.MagicMock(
            embedding=unittest.mock.MagicMock(return_value=mock_embed_response)
        )
        return unittest.mock.patch.dict("sys.modules", {"litellm": fake_module})

    # --- Test with a plain string query ---
    with stub_litellm():
        result = fs.retrieve_online_documents_openai(
            vector_store_id="item_embeddings",
            query="test query",
            max_num_results=5,
        )

    # Validate top-level OpenAI response shape
    assert result["object"] == "vector_store.search_results.page"
    assert isinstance(result["search_query"], list)
    assert result["search_query"] == ["test query"]
    assert result["has_more"] is False
    assert result["next_page"] is None

    assert isinstance(result["data"], list)
    assert len(result["data"]) > 0
    assert len(result["data"]) <= 5

    for item_result in result["data"]:
        assert "file_id" in item_result
        assert "filename" in item_result
        assert item_result["filename"] == "item_embeddings"
        assert "score" in item_result
        assert isinstance(item_result["score"], float)
        assert "attributes" in item_result
        assert isinstance(item_result["attributes"], dict)
        assert "content" in item_result
        assert isinstance(item_result["content"], list)
        for part in item_result["content"]:
            assert "type" in part
            assert part["type"] == "text"
            assert "text" in part

    # --- Test with features_to_retrieve ---
    with stub_litellm():
        result_subset = fs.retrieve_online_documents_openai(
            vector_store_id="item_embeddings",
            query="test query",
            max_num_results=5,
            features_to_retrieve=["text_field", "category"],
        )

    assert len(result_subset["data"]) > 0
    for item_result in result_subset["data"]:
        # The embedding was not requested, so it must not be an attribute.
        attr_keys = set(item_result["attributes"].keys())
        assert "embedding" not in attr_keys

    # --- Test with list query ---
    with stub_litellm():
        result_list = fs.retrieve_online_documents_openai(
            vector_store_id="item_embeddings",
            query=["term1", "term2"],
            max_num_results=5,
        )

    assert result_list["search_query"] == ["term1", "term2"]
| 1548 | + |
| 1549 | + |
@pytest.mark.integration
@pytest.mark.universal_online_stores(only=["pgvector", "elasticsearch"])
def test_retrieve_online_documents_openai_no_embedding_config(
    environment, fake_document_data
):
    """Verify a ValueError is raised when no embedding_model is configured."""
    store = environment.feature_store
    store.config.embedding_model = None
    search_kwargs = {"vector_store_id": "item_embeddings", "query": "test query"}

    with pytest.raises(ValueError, match="embedding_model is not configured"):
        store.retrieve_online_documents_openai(**search_kwargs)
| 1565 | + |
| 1566 | + |
@pytest.mark.integration
@pytest.mark.universal_online_stores(only=["pgvector", "elasticsearch"])
def test_retrieve_online_documents_openai_not_found(environment, fake_document_data):
    """Verify an unknown feature view raises FeatureViewNotFoundException."""
    store = environment.feature_store
    store.config.embedding_model = EmbeddingModelConfig(
        model="text-embedding-3-small"
    )
    search_kwargs = {
        "vector_store_id": "nonexistent_feature_view",
        "query": "test query",
    }

    with pytest.raises(FeatureViewNotFoundException):
        store.retrieve_online_documents_openai(**search_kwargs)
0 commit comments