Skip to content

Commit abf3ef2

Browse files
authored
Miscellaneous fixes (#506)
* Add knn-reuse/ to .gitignore
* Fix newline param in open()
* Avoid extra numpy conversion
* Use benchUtil.PERF_EXE in knnPerfTest
* Improve subprocess loop
1 parent 2051914 commit abf3ef2

3 files changed

Lines changed: 18 additions & 14 deletions

File tree

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ src/python/localconstants.py
2929
*.class
3030
*.pyc
3131
*.swp
32-
npm-debug.log
32+
npm-debug.log
3333
target/
3434
local/
3535
tmp/
@@ -43,3 +43,6 @@ knnIndices/
4343
# emacs temporary files
4444
*~
4545
#*
46+
47+
# index files
48+
knn-reuse/

src/python/knnPerfTest.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import benchUtil
3030
import constants
3131
import ps_head
32+
from benchUtil import PERF_EXE
3233
from common import getLuceneDirFromGradleProperties
3334

3435
# Measure vector search recall and latency while exploring hyperparameters
@@ -58,8 +59,6 @@
5859
# Set this to True to generate the disassembled code to verify the intended SIMD instructions are getting used or not
5960
PERF_MODE = False
6061

61-
PERF_PATH = shutil.which("perf")
62-
6362
# e.g. to compile KnnIndexer:
6463
#
6564
# javac -d build -cp /l/trunk/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar:/l/trunk/lucene/join/build/libs/lucene-join-10.0.0-SNAPSHOT.jar src/main/knn/*.java src/main/WikiVectors.java src/main/perf/VectorDictionary.java
@@ -351,9 +350,9 @@ def run_knn_benchmark(checkout, values, log_path):
351350
]
352351
)
353352

354-
if PERF_MODE and PERF_PATH:
353+
if PERF_MODE and PERF_EXE:
355354
print("Will be recording the executed instructions in perf.data file")
356-
perf_cmd = [PERF_PATH, "record", "-e", "instructions:u", "-o", f"perf{index_run}.data", "-g"] + this_cmd
355+
perf_cmd = [PERF_EXE, "record", "-e", "instructions:u", "-o", f"perf{index_run}.data", "-g"] + this_cmd
357356
job = subprocess.run(perf_cmd, check=False)
358357
if NOISY:
359358
print(f" cmd: {perf_cmd}")
@@ -389,14 +388,13 @@ def run_knn_benchmark(checkout, values, log_path):
389388
re_summary = re.compile(r"^SUMMARY: (.*?)$", re.MULTILINE)
390389
summary = None
391390
hit_exception = False
392-
lines = ""
393-
while True:
391+
while job.poll() is None:
394392
line = job.stdout.readline()
395-
if line == "":
396-
break
397-
lines += line
393+
if not line:
394+
continue
398395
if NOISY:
399396
sys.stdout.write(line)
397+
sys.stdout.flush()
400398
m = re_summary.match(line)
401399
if m is not None:
402400
summary = m.group(1)
@@ -421,11 +419,11 @@ def run_knn_benchmark(checkout, values, log_path):
421419

422420
if hit_exception:
423421
raise RuntimeError("unhandled java exception while running")
424-
if summary is None:
425-
raise RuntimeError("could not find summary line in output! " + lines)
426422
job.wait()
427423
if job.returncode != 0:
428424
raise RuntimeError(f"command failed with exit {job.returncode}")
425+
if summary is None:
426+
raise RuntimeError("could not find summary line in output! ")
429427
all_results.append((summary, args))
430428
if DO_PROFILING:
431429
benchUtil.profilerOutput(constants.JAVA_EXE, jfr_output, benchUtil.checkoutToPath(checkout), 30, (1,))

src/python/load_cohere_v3.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def main():
112112
# takes a long time! ~3.2 hours
113113
# NOTE: train is the only split
114114
docs = datasets.load_dataset(DATASET_NAME, LANG, split="train", streaming=True)
115+
docs = docs.with_format("numpy")
115116
# print(f'columns: {docs.column_names}')
116117

117118
features = docs.features
@@ -172,7 +173,7 @@ def main():
172173

173174
next_print_time_sec = start_time_sec
174175
last_vec_out_pos = 0
175-
with open(csv_source_file, "w", newlines="") as meta_out, open(vec_source_file, "wb") as vec_out:
176+
with open(csv_source_file, "w", newline="") as meta_out, open(vec_source_file, "wb") as vec_out:
176177
meta_csv_out = csv.writer(meta_out, lineterminator="\n")
177178
meta_csv_out.writerow(headers)
178179
for doc in docs:
@@ -187,8 +188,10 @@ def main():
187188
total_doc_count += 1
188189
cur_wiki_id = wiki_id
189190

190-
emb = np.array(doc["emb"], dtype=np.float32)
191+
emb = doc["emb"]
191192

193+
if emb.dtype != np.dtype("<f4"):
194+
raise RuntimeError(f"planned on little-endian float32 encoding but corpus is {emb.dtype}!")
192195
if len(emb) != DIMENSIONS:
193196
raise RuntimeError(f"planned on {DIMENSIONS} dims but corpus is {len(emb)}!")
194197
# print(f'{type(emb)}')

0 commit comments

Comments (0)