Skip to content

Commit 7f9b86b

Browse files
committed
Use hadd for merging ROOT files with podio-merge-files
1 parent dec6b3c commit 7f9b86b

3 files changed

Lines changed: 223 additions & 29 deletions

File tree

python/podio/root_io.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,18 @@ def __init__(self, filenames):
2626

2727
super().__init__()
2828

29+
def set_cache_size(self, size):
    """Configure how large the TTreeCache used for reading may grow.

    The cache is created per category on the first read of that category,
    so call this before starting to iterate over frames. Passing 0 turns
    the cache off. Without calling this method the cache size is 256 MiB.

    Args:
        size (int): Cache size in bytes
    """
    # Pure delegation to the wrapped reader object
    wrapped_reader = self._reader
    wrapped_reader.setCacheSize(size)
40+
2941

3042
class RNTupleReader(BaseReaderMixin):
3143
"""Reader class for reading podio RNTuple root files."""

python/podio/root_merge.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#!/usr/bin/env python3
2+
"""Utilities for merging podio ROOT files (TTree and RNTuple)."""
3+
4+
import os
5+
import shutil
6+
import subprocess
7+
import tempfile
8+
9+
import ROOT
10+
11+
from podio.frame import Frame
12+
from podio.utils import convert_to_str_paths
13+
from podio.root_io import Reader, RNTupleReader, RNTupleWriter, Writer
14+
15+
16+
def _get_tree_names(filename):
    """Return the names of all TTrees in a ROOT file.

    Args:
        filename (str): Path of the ROOT file to inspect.

    Returns:
        set[str]: Names of all keys in the file whose class name is ``TTree``.

    Raises:
        RuntimeError: If the file cannot be opened.
    """
    root_file = ROOT.TFile.Open(filename)
    if not root_file or root_file.IsZombie():
        raise RuntimeError(f"Cannot open file: {filename}")
    try:
        return {key.GetName() for key in root_file.GetListOfKeys() if key.GetClassName() == "TTree"}
    finally:
        # Close the file even if inspecting the keys raises, so that the
        # handle is never leaked
        root_file.Close()
24+
25+
26+
def _get_rntuple_names(filename):
    """Return the names of all RNTuples in a ROOT file.

    Args:
        filename (str): Path of the ROOT file to inspect.

    Returns:
        set[str]: Names of all keys in the file whose class name is
            ``ROOT::RNTuple``.

    Raises:
        RuntimeError: If the file cannot be opened.
    """
    root_file = ROOT.TFile.Open(filename)
    if not root_file or root_file.IsZombie():
        raise RuntimeError(f"Cannot open file: {filename}")
    try:
        names = set()
        for key in root_file.GetListOfKeys():
            if key.GetClassName() == "ROOT::RNTuple":
                names.add(key.GetName())
        return names
    finally:
        # Always release the file handle, also when reading the keys fails
        root_file.Close()
34+
35+
36+
def _hadd_path():
    """Locate the hadd executable on PATH.

    Returns:
        str: The path of the hadd executable as found by ``shutil.which``.

    Raises:
        RuntimeError: If no hadd executable can be found on PATH.
    """
    located = shutil.which("hadd")
    if located is not None:
        return located

    raise RuntimeError(
        "hadd not found on PATH. hadd is required by podio.root_merge.merge_files "
        "and is distributed with ROOT."
    )
45+
46+
47+
def _merge_files_impl(output_file, input_files, metadata, get_names_fn, reader_cls, writer_cls, fmt_name):
    """Shared worker behind merge_files and merge_files_rntuple.

    The inputs are first merged with hadd. The ``metadata`` category that
    hadd produced is then removed from the output and, unless *metadata* is
    ``"none"``, rewritten with the ``MergedInputFiles`` parameter set. The
    rewritten category goes through a temporary file and is attached to the
    output with TFileMerger.

    Args:
        output_file (str): Path of the output file to create.
        input_files (list[str]): Ordered list of input files (already str).
        metadata (str): ``"first"``, ``"all"``, or ``"none"``.
        get_names_fn: Callable returning the set of object names in a file.
        reader_cls: Reader class to use (Reader or RNTupleReader).
        writer_cls: Writer class to use (Writer or RNTupleWriter).
        fmt_name (str): Format label used in error messages (e.g. "TTree").

    Raises:
        RuntimeError: If hadd exits with a non-zero code, the merged file
            cannot be opened for update, or TFileMerger fails to add the
            metadata category.
    """
    # Only the first input file decides whether a metadata category exists
    metadata_present = "metadata" in get_names_fn(input_files[0])

    hadd_cmd = [_hadd_path(), "-f", "-k", output_file] + input_files
    proc = subprocess.run(hadd_cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"hadd failed (exit {proc.returncode}):\n{proc.stderr}")

    # Drop the metadata category as merged by hadd; it is rewritten below
    merged = ROOT.TFile.Open(output_file, "UPDATE")
    if not merged or merged.IsZombie():
        raise RuntimeError(f"Cannot open for UPDATE: {output_file}")
    try:
        if metadata_present:
            merged.Delete("metadata;*")
        merged.Write("", ROOT.TObject.kOverwrite)
    finally:
        merged.Close()

    if metadata == "none":
        return

    # Collect the metadata frames that should end up in the output
    if metadata_present:
        sources = [input_files[0]] if metadata == "first" else input_files
        source_reader = reader_cls(sources)
        meta_frames = list(source_reader.get("metadata"))
    else:
        meta_frames = [Frame()]

    for meta_frame in meta_frames:
        meta_frame.put_parameter("MergedInputFiles", list(input_files))

    # Write the frames into a scratch file and copy only the metadata object
    # from there into the output
    scratch_fd, scratch_path = tempfile.mkstemp(suffix=".root")
    os.close(scratch_fd)
    try:
        scratch_writer = writer_cls(scratch_path)
        for meta_frame in meta_frames:
            scratch_writer.write_frame(meta_frame, "metadata")
        scratch_writer._writer.finish()

        merger = ROOT.TFileMerger(ROOT.kFALSE)
        merger.OutputFile(output_file, "UPDATE")
        merger.AddFile(scratch_path)
        merger.SetFastMethod(ROOT.kTRUE)
        merger.AddObjectNames("metadata")
        merge_mode = ROOT.TFileMerger.kAll | ROOT.TFileMerger.kOnlyListed | ROOT.TFileMerger.kIncremental
        if not merger.PartialMerge(merge_mode):
            raise RuntimeError(f"TFileMerger failed adding metadata {fmt_name} to {output_file}")
    finally:
        # The scratch file is never part of the result
        if os.path.exists(scratch_path):
            os.unlink(scratch_path)
109+
110+
111+
def merge_files(output_file, input_files, metadata="first"):
    """Merge several podio TTree files into one output file.

    The event data are merged with hadd. Afterwards the ``metadata``
    category is rewritten (with the ``MergedInputFiles`` parameter set)
    using a temporary file and TFileMerger.

    The category and collection layout has to be the same in all input
    files.

    Args:
        output_file (str or Path): Path of the output file to create.
        input_files (list[str] or list[Path]): Ordered list of input files.
        metadata (str): How to handle the ``metadata`` Frame category.
            ``"first"`` – copy only the first file's entry (default).
            ``"all"``   – copy entries from every file.
            ``"none"``  – omit the metadata category entirely.

    Raises:
        ValueError: If *input_files* is empty or *metadata* is not one of the
            accepted values.
        RuntimeError: If a file cannot be opened, hadd is not found, or
            trees are inconsistent.
    """
    # Validate the arguments before touching any file
    if not input_files:
        raise ValueError("input_files must not be empty")
    accepted_modes = ("first", "all", "none")
    if metadata not in accepted_modes:
        raise ValueError(f"metadata must be 'first', 'all', or 'none', got {metadata!r}")

    # From here on everything works with plain string paths
    str_inputs = [str(path) for path in convert_to_str_paths(input_files)]
    str_output = str(convert_to_str_paths(output_file)[0])

    _merge_files_impl(
        output_file=str_output,
        input_files=str_inputs,
        metadata=metadata,
        get_names_fn=_get_tree_names,
        reader_cls=Reader,
        writer_cls=Writer,
        fmt_name="TTree",
    )
144+
145+
146+
def merge_files_rntuple(output_file, input_files, metadata="first"):
    """Merge several podio RNTuple files into one output file.

    The event data are merged with hadd. Afterwards the ``metadata``
    category is rewritten (with the ``MergedInputFiles`` parameter set)
    using a temporary file and TFileMerger.

    The category and collection layout has to be the same in all input
    files.

    Args:
        output_file (str or Path): Path of the output file to create.
        input_files (list[str] or list[Path]): Ordered list of input files.
        metadata (str): How to handle the ``metadata`` Frame category.
            ``"first"`` – copy only the first file's entry (default).
            ``"all"``   – copy entries from every file.
            ``"none"``  – omit the metadata category entirely.

    Raises:
        ValueError: If *input_files* is empty or *metadata* is not one of the
            accepted values.
        RuntimeError: If a file cannot be opened, hadd is not found, or
            ntuples are inconsistent.
    """
    # Reject invalid arguments up front, before any file is opened
    if not input_files:
        raise ValueError("input_files must not be empty")
    if metadata not in ("first", "all", "none"):
        raise ValueError(f"metadata must be 'first', 'all', or 'none', got {metadata!r}")

    # Normalize all paths to plain strings
    normalized_inputs = []
    for path in convert_to_str_paths(input_files):
        normalized_inputs.append(str(path))
    normalized_output = str(convert_to_str_paths(output_file)[0])

    _merge_files_impl(
        normalized_output,
        normalized_inputs,
        metadata,
        get_names_fn=_get_rntuple_names,
        reader_cls=RNTupleReader,
        writer_cls=RNTupleWriter,
        fmt_name="RNTuple",
    )

tools/podio-merge-files

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ args = parser.parse_args()
2222

2323
# Import podio later for quick help messages
2424
import podio # pylint: disable=wrong-import-position # noqa: E402
25-
import podio.root_io # pylint: disable=wrong-import-position # noqa: E402
2625
from podio import reading # pylint: disable=wrong-import-position # noqa: E402
26+
from podio.root_io import Reader, RNTupleReader # pylint: disable=wrong-import-position # noqa: E402
27+
from podio.root_merge import merge_files, merge_files_rntuple # pylint: disable=wrong-import-position # noqa: E402
2728

2829
all_files = set()
2930
for f in args.files:
@@ -32,37 +33,40 @@ for f in args.files:
3233
all_files.add(f)
3334

3435
reader = reading.get_reader(args.files)
35-
if isinstance(reader, podio.root_io.Reader):
36-
writer = podio.root_io.Writer(args.output_file)
37-
elif isinstance(reader, podio.root_io.RNTupleReader):
38-
writer = podio.root_io.RNTupleWriter(args.output_file)
36+
37+
if isinstance(reader, Reader):
38+
merge_files(args.output_file, args.files, metadata=args.metadata)
39+
elif isinstance(reader, RNTupleReader):
40+
merge_files_rntuple(args.output_file, args.files, metadata=args.metadata)
3941
else:
40-
raise ValueError(f"Input file {args.files[0]} is not a TTree or RNTuple file")
42+
# Slow path: frame-by-frame copy
43+
from podio import sio_io # pylint: disable=wrong-import-position # noqa: E402
44+
writer = sio_io.Writer(args.output_file)
4145

42-
categories = list(reader.categories)
43-
is_metadata_available = True # pylint: disable=invalid-name
44-
try:
45-
# All frames will be copied as they are except the metadata ones
46-
categories.remove("metadata")
47-
except ValueError:
48-
is_metadata_available = False # pylint: disable=invalid-name
46+
categories = list(reader.categories)
47+
is_metadata_available = True # pylint: disable=invalid-name
48+
try:
49+
# All frames will be copied as they are except the metadata ones
50+
categories.remove("metadata")
51+
except ValueError:
52+
is_metadata_available = False # pylint: disable=invalid-name
4953

50-
for category in tqdm(categories):
51-
all_frames = reader.get(category)
52-
for frame in tqdm(all_frames, desc=f"Merging category '{category}'"):
53-
writer.write_frame(frame, category)
54+
for category in tqdm(categories):
55+
all_frames = reader.get(category)
56+
for frame in tqdm(all_frames, desc=f"Merging category '{category}'"):
57+
writer.write_frame(frame, category)
5458

55-
if args.metadata == "none":
56-
sys.exit(0)
59+
if args.metadata == "none":
60+
sys.exit(0)
5761

58-
if not is_metadata_available:
59-
print("Warning: metadata category 'metadata' not found in the input files, it will be created")
60-
all_frames = [podio.Frame()]
61-
else:
62-
if args.metadata == "first":
63-
all_frames = [reader.get("metadata")[0]]
62+
if not is_metadata_available:
63+
print("Warning: metadata category 'metadata' not found in the input files, it will be created")
64+
all_frames = [podio.Frame()]
6465
else:
65-
all_frames = reader.get("metadata")
66-
for frame in all_frames:
67-
frame.put_parameter("MergedInputFiles", args.files)
68-
writer.write_frame(frame, "metadata")
66+
if args.metadata == "first":
67+
all_frames = [reader.get("metadata")[0]]
68+
else:
69+
all_frames = reader.get("metadata")
70+
for frame in all_frames:
71+
frame.put_parameter("MergedInputFiles", args.files)
72+
writer.write_frame(frame, "metadata")

0 commit comments

Comments
 (0)