Skip to content

Commit abf3ef2

Browse files
authored
Miscellaneous fixes (#506)
* Add knn-reuse/ to .gitignore
* Fix newline param in open()
* Avoid extra numpy conversion
* Use benchUtil.PERF_EXE in knnPerfTest
* Improve subprocess loop
1 parent 2051914 commit abf3ef2

3 files changed

Lines changed: 18 additions & 14 deletions

File tree

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ src/python/localconstants.py
2929
*.class
3030
*.pyc
3131
*.swp
32-
npm-debug.log
32+
npm-debug.log
3333
target/
3434
local/
3535
tmp/
@@ -43,3 +43,6 @@ knnIndices/
4343
# emacs temporary files
4444
*~
4545
#*
46+
47+
# index files
48+
knn-reuse/

src/python/knnPerfTest.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import benchUtil
3030
import constants
3131
import ps_head
32+
from benchUtil import PERF_EXE
3233
from common import getLuceneDirFromGradleProperties
3334

3435
# Measure vector search recall and latency while exploring hyperparameters
@@ -58,8 +59,6 @@
5859
# Set this to True to generate the disassembled code to verify the intended SIMD instructions are getting used or not
5960
PERF_MODE = False
6061

61-
PERF_PATH = shutil.which("perf")
62-
6362
# e.g. to compile KnnIndexer:
6463
#
6564
# javac -d build -cp /l/trunk/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar:/l/trunk/lucene/join/build/libs/lucene-join-10.0.0-SNAPSHOT.jar src/main/knn/*.java src/main/WikiVectors.java src/main/perf/VectorDictionary.java
@@ -351,9 +350,9 @@ def run_knn_benchmark(checkout, values, log_path):
351350
]
352351
)
353352

354-
if PERF_MODE and PERF_PATH:
353+
if PERF_MODE and PERF_EXE:
355354
print("Will be recording the executed instructions in perf.data file")
356-
perf_cmd = [PERF_PATH, "record", "-e", "instructions:u", "-o", f"perf{index_run}.data", "-g"] + this_cmd
355+
perf_cmd = [PERF_EXE, "record", "-e", "instructions:u", "-o", f"perf{index_run}.data", "-g"] + this_cmd
357356
job = subprocess.run(perf_cmd, check=False)
358357
if NOISY:
359358
print(f" cmd: {perf_cmd}")
@@ -389,14 +388,13 @@ def run_knn_benchmark(checkout, values, log_path):
389388
re_summary = re.compile(r"^SUMMARY: (.*?)$", re.MULTILINE)
390389
summary = None
391390
hit_exception = False
392-
lines = ""
393-
while True:
391+
while job.poll() is None:
394392
line = job.stdout.readline()
395-
if line == "":
396-
break
397-
lines += line
393+
if not line:
394+
continue
398395
if NOISY:
399396
sys.stdout.write(line)
397+
sys.stdout.flush()
400398
m = re_summary.match(line)
401399
if m is not None:
402400
summary = m.group(1)
@@ -421,11 +419,11 @@ def run_knn_benchmark(checkout, values, log_path):
421419

422420
if hit_exception:
423421
raise RuntimeError("unhandled java exception while running")
424-
if summary is None:
425-
raise RuntimeError("could not find summary line in output! " + lines)
426422
job.wait()
427423
if job.returncode != 0:
428424
raise RuntimeError(f"command failed with exit {job.returncode}")
425+
if summary is None:
426+
raise RuntimeError("could not find summary line in output! ")
429427
all_results.append((summary, args))
430428
if DO_PROFILING:
431429
benchUtil.profilerOutput(constants.JAVA_EXE, jfr_output, benchUtil.checkoutToPath(checkout), 30, (1,))

src/python/load_cohere_v3.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def main():
112112
# takes a long time! ~3.2 hours
113113
# NOTE: train is the only split
114114
docs = datasets.load_dataset(DATASET_NAME, LANG, split="train", streaming=True)
115+
docs = docs.with_format("numpy")
115116
# print(f'columns: {docs.column_names}')
116117

117118
features = docs.features
@@ -172,7 +173,7 @@ def main():
172173

173174
next_print_time_sec = start_time_sec
174175
last_vec_out_pos = 0
175-
with open(csv_source_file, "w", newlines="") as meta_out, open(vec_source_file, "wb") as vec_out:
176+
with open(csv_source_file, "w", newline="") as meta_out, open(vec_source_file, "wb") as vec_out:
176177
meta_csv_out = csv.writer(meta_out, lineterminator="\n")
177178
meta_csv_out.writerow(headers)
178179
for doc in docs:
@@ -187,8 +188,10 @@ def main():
187188
total_doc_count += 1
188189
cur_wiki_id = wiki_id
189190

190-
emb = np.array(doc["emb"], dtype=np.float32)
191+
emb = doc["emb"]
191192

193+
if emb.dtype != np.dtype("<f4"):
194+
raise RuntimeError(f"planned on little-endian float32 encoding but corpus is {emb.dtype}!")
192195
if len(emb) != DIMENSIONS:
193196
raise RuntimeError(f"planned on {DIMENSIONS} dims but corpus is {len(emb)}!")
194197
# print(f'{type(emb)}')

0 commit comments

Comments (0)