-
Notifications
You must be signed in to change notification settings - Fork 253
Expand file tree
/
Copy pathfragment_utils.py
More file actions
312 lines (262 loc) · 9.16 KB
/
fragment_utils.py
File metadata and controls
312 lines (262 loc) · 9.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
from rmgpy.molecule.fragment import Fragment
from rmgpy.tools.canteramodel import Cantera
from rmgpy.chemkin import load_chemkin_file
import re
import os
import numpy as np
import matplotlib.pyplot as plt
def match_sequences(seq1, seq2, rtol=1e-6):
'''
Given two lists (each item is int or float):
seq1 and seq2 with same sum, the method returns
matched indices and values.
Example:
seq1 = [1, 3, 1]
seq2 = [2, 1, 2]
return: [[(0,0),1],
[(1,0),1],
[(1,1),1],
[(1,2),1],
[(2,2),1]]
'''
sum_diff = sum(seq2) - sum(seq1)
assert (
np.isclose(sum(seq1), sum(seq2), rtol=rtol)
), "seq1 has different sum (diff={0}) than seq2.".format(sum_diff)
# force the sum to be same if the difference
# is small enough
if sum_diff >= 0:
seq1[-1] = seq1[-1] + sum_diff
else:
seq2[-1] = seq2[-1] - sum_diff
# make cumulative sequences
cum_seq1 = np.cumsum(seq1)
cum_seq2 = np.cumsum(seq2)
# add index tags two both cumulative seqs
pin1 = 0
pin2 = 0
matched_indices = []
matched_cum_values = []
while pin1 < len(cum_seq1) and pin2 < len(cum_seq2):
matched_indices.append((pin1, pin2))
if cum_seq1[pin1] > cum_seq2[pin2]:
matched_cum_values.append(cum_seq2[pin2])
pin2 += 1
elif cum_seq1[pin1] < cum_seq2[pin2]:
matched_cum_values.append(cum_seq1[pin1])
pin1 += 1
else:
matched_cum_values.append(cum_seq2[pin2])
pin1 += 1
pin2 += 1
# get matches
matches = []
for i in range(len(matched_indices)):
matched_index_tup = matched_indices[i]
matched_cum_value = matched_cum_values[i]
if i == 0:
previous_cum_value = 0
else:
previous_cum_value = matched_cum_values[i - 1]
matches.append(
[matched_index_tup, matched_cum_value - previous_cum_value])
return matches
def match_concentrations_with_same_sums(conc1, conc2, rtol=1e-6):
'''match_concentrations_with_same_sums
Given two lists with each item to be a tuple
(species label, concentration)
conc1 and conc2 with same total concentrations,
the method returns matched species labels and
concentrations.
Example:
conc1 = [('a', 1),
('b', 3),
('c', 1)]
conc2 = [('x', 2),
('y', 1),
('z', 2)]
return: [(('a','x'),1),
(('b','x'),1),
(('b','y'),1),
(('b','z'),1),
(('c','z'),1)]
'''
labels1 = [tup[0] for tup in conc1]
labels2 = [tup[0] for tup in conc2]
seq1 = [tup[1] for tup in conc1]
seq2 = [tup[1] for tup in conc2]
matches_seq = match_sequences(seq1, seq2, rtol)
matches_conc = []
for match_seq in matches_seq:
matched_label_index1 = match_seq[0][0]
matched_label_index2 = match_seq[0][1]
matched_value = match_seq[1]
matched_label1 = labels1[matched_label_index1]
matched_label2 = labels2[matched_label_index2]
match_conc = ((matched_label1, matched_label2), matched_value)
matches_conc.append(match_conc)
return matches_conc
def match_concentrations_with_different_sums(conc1, conc2):
"""
Given two lists with each item to be a tuple
(species label, concentration)
conc1 and conc2 with different total concentrations,
the method returns matched species labels and
concentrations.
Example:
conc1 = [('a', 1),
('b', 3),
('c', 1)]
conc2 = [('x', 2),
('y', 1),
('z', 10)]
return: [(('a','x', 'z', 'z'),1),
(('b','x', 'z', 'z'),1),
(('b','y', 'z', 'z'),1),
(('b','z', 'z'),1),
(('c','z', 'z'),1)]
"""
labels1 = [tup[0] for tup in conc1]
labels2 = [tup[0] for tup in conc2]
seq1 = [tup[1] for tup in conc1]
seq2 = [tup[1] for tup in conc2]
matches_conc = []
pin1 = 0
pin2 = 0
val1 = seq1[pin1]
val2 = seq2[pin2]
while True:
if val1 > val2:
match = ((labels1[pin1], labels2[pin2]), val2)
matches_conc.append(match)
val1 = val1 - val2
pin2 += 1
if pin2 == len(seq2):
break
val2 = seq2[pin2]
elif val1 < val2:
match = ((labels1[pin1], labels2[pin2]), val1)
matches_conc.append(match)
val2 = val2 - val1
pin1 += 1
if pin1 == len(seq1):
break
val1 = seq1[pin1]
else:
match = ((labels1[pin1], labels2[pin2]), val1)
matches_conc.append(match)
pin1 += 1
pin2 += 1
if pin1 == len(seq1):
break
val1 = seq1[pin1]
if pin2 == len(seq2):
break
val2 = seq2[pin2]
# if pin2 first reaches the end
# append all the remaining seq1 to matches_conc
if pin2 == len(seq2) and pin1 < len(seq1):
remain_conc1 = [(labels1[pin1], val1)] + conc1[(pin1 + 1):]
matches_conc.extend(remain_conc1)
# if pin1 first reaches the end
# let matches_conc match with remaining seq2
elif pin1 == len(seq1) and pin2 < len(seq2):
remain_conc2 = [(labels2[pin2], val2)] + conc2[(pin2 + 1):]
matches_conc = match_concentrations_with_different_sums(
matches_conc, remain_conc2
)
# if pin1 and pin2 reach the ends at same time
# matches_conc is ready to return
return matches_conc
def shuffle(conc, seed=None):
"""
Randomly shuffle a list of fragments
"""
idx_arr = np.arange(len(conc))
if seed is not None:
np.random.seed(seed)
np.random.shuffle(idx_arr)
return [conc[idx] for idx in idx_arr]
def flatten(combo):
"""
Given a combo nested `tuple`, e.g.,
((('LY', 'XR'), ('LWL', 'RUR'))
return a list of labels contained in
the combo ['LY', 'XR', 'LWL', 'RUR']
"""
return_list = []
for i in combo:
if isinstance(i, tuple):
return_list.extend(flatten(i))
else:
return_list.append(i)
return return_list
# label should match the desired merging l/'abel on frag2
def merge_frag_to_frag(frag1, frag2, label):
from rmgpy.molecule import Bond
from rmgpy.molecule.fragment import Fragment, CuttingLabel
frag_spe1 = Fragment().from_smiles_like_string(frag1)
frag_spe2 = Fragment().from_smiles_like_string(frag2)
# find position of desired CuttingLabel
# need to find CuttingLabel on frag2 first
for vertex in frag_spe2.vertices:
if isinstance(vertex, CuttingLabel):
if vertex.symbol == label:
cut2 = vertex
atom2 = list(cut2.edges.keys())[0]
frag_spe2.remove_atom(cut2)
break
if cut2.symbol[0] == 'L':
Ctl = cut2.symbol.replace('L', 'R')
else: # that means this CuttingLabel is R something
Ctl = cut2.symbol.replace('R', 'L')
# merge to frag_spe1
for vertex in frag_spe1.vertices:
if isinstance(vertex, CuttingLabel):
if vertex.symbol == Ctl:
cut1 = vertex
atom1 = list(cut1.edges.keys())[0]
frag_spe1.remove_atom(cut1)
break
# new merged fragment
new_frag = frag_spe1.merge(frag_spe2)
new_frag.add_bond(Bond(atom1=atom1, atom2=atom2, order=1))
new_frag = new_frag.copy(deep=True)
new_frag.update()
return new_frag # return Fragment obtl
def merge_frag_list(to_be_merged):
import os
# merges fragments in list from right to left
species_list = []
ethylene = []
newlist = []
warnings = []
while len(to_be_merged) > 1:
# second to last fragmentin list
frag1 = to_be_merged[-2].smiles
frag2 = to_be_merged[-1].smiles # last fragment in list
if 'R' in frag1 and 'L' in frag2:
newfrag = merge_frag_to_frag(frag1, frag2, 'L')
elif 'L' in frag1 and 'R' in frag2:
newfrag = merge_frag_to_frag(frag1, frag2, 'R')
# warn user if last two fragments in list cannot be merged (no R/L
# combo to be made)
else:
print('Warning! Could not merge fragments {} and {}'.format(
frag1, frag2))
if 'L' in frag1 and 'L' in frag2:
newfrag = merge_frag_to_frag(
frag1.replace('L', 'R'), frag2, 'L')
if len(to_be_merged) > 2:
cut = len(to_be_merged) - 2
newfraglist = to_be_merged[:cut]
newfraglist.append(newfrag)
elif len(to_be_merged) == 2:
newfraglist = [newfrag]
to_be_merged = newfraglist
to_be_merged = newfraglist
# newlist.append(newfraglist) # if done merging list, write final
# structure to list of smiles structures
# print('{}% of fragments fully merged...'.format(np.round(100*(i+1)/len(flattened_matches_random)),1))
# print(newfraglist)
return newfraglist