Skip to content

Commit 4ce514b

Browse files
xPawLifeismana
authored and committed
parallel strings dedup & depots processing
1 parent db97359 commit 4ce514b

File tree

1 file changed

+122
-69
lines changed

1 file changed

+122
-69
lines changed

common.sh

Lines changed: 122 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -28,57 +28,71 @@ if [[ $# -gt 0 ]]; then
2828
fi
2929
fi
3030

31-
# ProcessDepot - Processes binary files of a given type by dumping protobufs and extracting strings.
32-
# @param $1 - File extension to process (e.g. .dll, .so, .dylib, .exe)
31+
# _StringsPath - Derives the _strings.txt path from a binary file path and its extension.
# .exe files append _strings.txt (foo.exe -> foo.exe_strings.txt),
# other extensions replace the suffix (foo.dll -> foo_strings.txt).
# The extension is matched literally, never as a glob pattern. If the path does
# not end with the extension, _strings.txt is appended instead, so the result is
# never the input path itself (a caller redirecting output to the result can
# therefore never truncate the binary it is reading).
# @param $1 - File path
# @param $2 - File extension
# @output   - The derived path, written to stdout
_StringsPath ()
{
	if [[ "$2" == ".exe" ]]; then
		printf '%s\n' "${1}_strings.txt"
	else
		# Quoting "$2" inside the expansion disables pattern matching on it
		printf '%s\n' "${1%"$2"}_strings.txt"
	fi
}
44+
45+
# _ProcessBinary - Processes a single binary file by dumping protobufs and extracting strings.
# Files whose base name (with the extension stripped) is "steamclient" or
# "libcef" are skipped without producing any output.
# Reads the globals PROTOBUF_DUMPER_PATH and DUMP_STRINGS_PATH (commands to run).
# @param $1 - File path to process
# @param $2 - File extension (e.g. .dll, .so, .dylib, .exe)
_ProcessBinary ()
{
	local file="$1"
	local ext="$2"

	# Skip common not game-specific binaries.
	# The base name is derived with parameter expansion instead of forking
	# `basename` once per file; the extension is stripped literally.
	local name="${file##*/}"
	name="${name%"$ext"}"
	case "$name" in
		steamclient|libcef)
			return 0
			;;
	esac

	echo " $file"

	# Extract protobuf definitions from the binary (tool output is discarded)
	"$PROTOBUF_DUMPER_PATH" "$file" "Protobufs/" > /dev/null

	# Extract readable strings from the binary, sort and deduplicate them
	"$DUMP_STRINGS_PATH" -binary "$file" | sort --unique > "$(_StringsPath "$file" "$ext")"
}
68+
69+
# ProcessDepot - Processes binary files by dumping protobufs and extracting strings.
# For every extension given, finds all regular files below the current
# directory whose name ends with that extension and runs _ProcessBinary on each
# one as a background job, keeping at most max_jobs jobs running at a time.
# Creates the "Protobufs" directory if it does not exist.
# @param $@ - File extensions to process (e.g. .dll .so .dylib .exe)
ProcessDepot ()
{
	echo "::group::Processing binaries ($*)"

	# rm -r "Protobufs"
	mkdir -p "Protobufs"

	local max_jobs=10
	local job_count=0

	# Loop variables are local so they do not leak into (or clobber) the caller's globals
	local ext file

	for ext in "$@"; do
		# Find all files matching the given extension and process each one
		while IFS= read -r -d '' file
		do
			_ProcessBinary "$file" "$ext" &

			# Once the job limit is reached, block until any one job finishes
			((++job_count))
			if ((job_count >= max_jobs)); then
				wait -n
				((job_count--))
			fi
		done < <(find . -type f -name "*$ext" -print0)
	done

	# Barrier: wait for all remaining background jobs before closing the group
	wait

	echo "::endgroup::"
}
@@ -94,26 +108,52 @@ ProcessVPK ()
94108
echo " $file"
95109

96110
# Write the VPK's file list to a .txt file with the same name
97-
"$VRF_PATH" --input "$file" --vpk_list > "$(echo "$file" | sed -e 's/\.vpk$/\.txt/g')"
111+
"$VRF_PATH" --input "$file" --vpk_list > "${file/%.vpk/.txt}"
98112
done < <(find . -type f -name "*_dir.vpk" -print0)
99113

100114
echo "::endgroup::"
101115
}
102116

117+
# _DeduplicateStringsFile - Deduplicates a single strings file against a merged reference.
# Lines of the strings file that also appear in the reference are removed, and
# the strings file is replaced with the result. Both files must be sorted, as
# `comm` requires. If `comm` fails, the strings file is left untouched and the
# temporary file is removed, instead of replacing the original with empty or
# partial output.
# @param $1 - Strings file to deduplicate
# @param $2 - Merged dedupe reference file
# @return   - 0 on success, non-zero if the comparison or the replacement failed
_DeduplicateStringsFile ()
{
	local target_file="$1"
	local merged_dedupe="$2"
	local tmp_file="$target_file.tmp"

	# Remove lines present in reference files and replace the original,
	# but only when comm actually succeeded
	if comm -23 -- "$target_file" "$merged_dedupe" > "$tmp_file"; then
		mv -- "$tmp_file" "$target_file"
	else
		rm -f -- "$tmp_file"
		echo "Failed to deduplicate $target_file" >&2
		return 1
	fi
}
129+
103130
# DeduplicateStringsFrom - Removes duplicate string lines from extracted strings files
104131
# by filtering out lines that appear in the provided dedupe reference files.
105-
# @param $1 - File suffix to match binaries (e.g. .dll, .so)
106-
# @param $@ - One or more reference files whose lines will be subtracted from other strings files
132+
# @param -- - Separator between suffixes and reference files
133+
# @usage DeduplicateStringsFrom .dll .exe -- file1.txt file2.txt
107134
DeduplicateStringsFrom ()
108135
{
109-
suffix="$1"
110-
shift
136+
# Split arguments into suffixes (before --) and reference files (after --)
137+
local suffixes=()
138+
local ref_files=()
139+
local found_separator=0
140+
141+
for arg in "$@"; do
142+
if [[ "$arg" == "--" ]]; then
143+
found_separator=1
144+
elif ((found_separator)); then
145+
ref_files+=("$arg")
146+
else
147+
suffixes+=("$arg")
148+
fi
149+
done
111150

112-
echo "::group::Deduplicating strings ($suffix)"
151+
echo "::group::Deduplicating strings (${suffixes[*]})"
113152

114153
# Resolve all dedupe reference files to absolute paths, warn if missing
115-
dedupe_files=()
116-
for file in "$@"; do
154+
local dedupe_files=()
155+
for file in "${ref_files[@]}"; do
156+
local resolved
117157
resolved="$(realpath "$file")"
118158
if [[ -f "$resolved" ]]; then
119159
dedupe_files+=("$resolved")
@@ -123,31 +163,44 @@ DeduplicateStringsFrom ()
123163
done
124164

125165
# Merge all reference files into a single sorted set
166+
local merged_dedupe
126167
merged_dedupe="$(mktemp)"
127168
sort --unique --merge "${dedupe_files[@]}" > "$merged_dedupe"
128169

129-
# Iterate over all binaries matching the suffix and process their strings files
130-
while IFS= read -r -d '' file
131-
do
132-
# Derive the corresponding _strings.txt path from the binary path
133-
target_file="$(realpath "$file" | sed -e "s/$(echo "$suffix" | sed 's/\./\\./g')$/_strings.txt/g")"
170+
local max_jobs=10
171+
local job_count=0
134172

135-
# Skip if no strings file exists for this binary
136-
if ! [[ -f "$target_file" ]]; then
137-
continue
138-
fi
173+
for suffix in "${suffixes[@]}"; do
174+
# Iterate over all binaries matching the suffix and process their strings files
175+
while IFS= read -r -d '' file
176+
do
177+
# Derive the corresponding _strings.txt path from the binary path
178+
local target_file
179+
target_file="$(_StringsPath "$(realpath "$file")" "$suffix")"
139180

140-
# Don't deduplicate a file against itself
141-
for dedupe_file in "${dedupe_files[@]}"; do
142-
if [[ "$dedupe_file" = "$target_file" ]]; then
143-
continue 2
181+
# Skip if no strings file exists for this binary
182+
if ! [[ -f "$target_file" ]]; then
183+
continue
144184
fi
145-
done
146185

147-
# Remove lines present in reference files and replace the original
148-
comm -23 "$target_file" "$merged_dedupe" > "$target_file.tmp"
149-
mv "$target_file.tmp" "$target_file"
150-
done < <(find . -type f -name "*$suffix" -print0)
186+
# Don't deduplicate a file against itself
187+
for dedupe_file in "${dedupe_files[@]}"; do
188+
if [[ "$dedupe_file" = "$target_file" ]]; then
189+
continue 2
190+
fi
191+
done
192+
193+
_DeduplicateStringsFile "$target_file" "$merged_dedupe" &
194+
195+
((++job_count))
196+
if ((job_count >= max_jobs)); then
197+
wait -n
198+
((job_count--))
199+
fi
200+
done < <(find . -type f -name "*$suffix" -print0)
201+
done
202+
203+
wait
151204

152205
rm -f "$merged_dedupe"
153206

@@ -165,7 +218,7 @@ ProcessToolAssetInfo ()
165218
echo " $file"
166219

167220
# Dump asset info in short format, replacing .bin extension with .txt
168-
"$VRF_PATH" --input "$file" --output "$(echo "$file" | sed -e 's/\.bin$/\.txt/g')" --tools_asset_info_short || echo "S2V failed to dump tools asset info"
221+
"$VRF_PATH" --input "$file" --output "${file/%.bin/.txt}" --tools_asset_info_short || echo "S2V failed to dump tools asset info"
169222
done < <(find . -type f -name "*asset_info.bin" -print0)
170223

171224
echo "::endgroup::"

0 commit comments

Comments
 (0)