Well, it was harder - and more fun! - than I expected.
from collections import deque
def align_wordlists(words1, words2):
    """Re-split the phrases of ``words2`` along the boundaries of ``words1``.

    Both arguments are lists of space-separated strings that contain the
    same words in the same order but are chunked differently.  Every phrase
    of ``words2`` that lies entirely inside one element of ``words1`` is
    returned unchanged; a phrase that crosses one or more ``words1``
    boundaries is returned as a list of its pieces, cut at those boundaries.

    Args:
        words1: List of strings whose element boundaries define the cuts.
        words2: List of strings to be aligned against ``words1``.

    Returns:
        A list with one entry per element of ``words2``: either a ``str``
        (phrase not cut) or a ``list`` of ``str`` (phrase cut into pieces).

    Raises:
        AssertionError: If the two lists do not contain exactly the same
            sequence of words.

    Example:
        >>> align_wordlists(["This is a sentence", "so is this"],
        ...                 ["This is", "a sentence so", "is this"])
        ['This is', ['a sentence', 'so'], 'is this']
    """
    words1_split = [e.split(" ") for e in words1]
    words2_split = [e.split(" ") for e in words2]

    flat1 = [word for split in words1_split for word in split]
    flat2 = [word for split in words2_split for word in split]
    if flat1 != flat2:
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is kept for callers.
        raise AssertionError(
            "words1 and words2 must contain the same words in the same order"
        )

    result = []
    sentence_idx = 0     # index of the current element of words1_split
    sentence_offset = 0  # words of that element already consumed

    for phrase in words2_split:
        pieces = []
        start = 0
        # str.split(" ") never returns an empty list, so this loop body runs
        # at least once and ``pieces`` is never empty.
        while start < len(phrase):
            sentence_len = len(words1_split[sentence_idx])
            room = sentence_len - sentence_offset
            take = min(room, len(phrase) - start)
            pieces.append(" ".join(phrase[start:start + take]))
            start += take
            sentence_offset += take
            if sentence_offset == sentence_len:
                # Current words1 element is used up: move on to the next.
                sentence_idx += 1
                sentence_offset = 0
        # An uncut phrase stays a plain string; a cut one becomes a list.
        result.append(pieces[0] if len(pieces) == 1 else pieces)

    return result
Examples
>>> words1 = ["This is a sentence", "so is this"]
>>> words2 = ["This is", "a sentence so", "is this"]
>>> align_wordlists(words1, words2)
['This is', ['a sentence', 'so'], 'is this']
>>> words1 = ["This is a longer", "sentence with", "different splits"]
>>> words2 = ["This is", "a longer sentence", "with different splits"]
>>> align_wordlists(words1, words2)
['This is', ['a longer', 'sentence'], ['with', 'different splits']]
>>> words1 = ["This is a longer", "sentence with", "different splits"]
>>> words2 = ["This is", "a longer sentence with different splits"]
>>> align_wordlists(words1, words2)
['This is', ['a longer', 'sentence with', 'different splits']]
Algorithm Overview
High level description of the algorithm used here. The problem you described boils down to the following question:
For each phrase in the second list of words, to which sentence in the first list does it belong?
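To answer it, the algorithm proceeds as follows:
It walks through words1 and words2 in parallel. Each phrase of words2 is consumed word by word, and it is cut wherever a sentence of words1 ends.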
def align_wordlists(words1, words2):
# Split every element of the word lists
# >>> [e.split(" ") for e in ["This is", "a sentence"]]
# [["This", "is"], ["a", "sentence"]]
words1_split = [e.split(" ") for e in words1]
words2_split = [e.split(" ") for e in words2]
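Before going any further, make sure that both lists contain exactly the same words, in the same order (i.e. they differ only in where they are split), because otherwise no alignment exists: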
assert [word for split in words1_split for word in split] == \
[word for split in words2_split for word in split]
Next, set up the work queue: a deque, the double-ended queue from Python's collections module.
Q = deque(enumerate(words2_split))
result = []
splits = []
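result collects the output pieces, and splits records which phrases had to be cut. Every queue entry is paired with its index in words2 by enumerate.
Two counters track the position in words1: the index of the current sentence, and the word offset inside that sentence.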
words1_sublist_id = 0
words1_sublist_offset = 0
The "main loop" keeps running as long as there is something left in the queue:
while Q:
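First: take the next entry from the front of the queue. It yields 3 values. sublist_id is the index of the phrase in words2, sublist is the list of words of that phrase that still have to be placed, and sublist_len is how many words that is.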
sublist_id, sublist = Q.popleft()
sublist_len = len(sublist)
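Then work out how long the current words1 sentence is and how many of its words are still unclaimed. (Since words1_sublist_id starts at 0, the first sentence is examined first.)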
words1_sublist_len = len(words1_split[words1_sublist_id])
words1_remaining_len = words1_sublist_len - words1_sublist_offset
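The key question: "does the phrase fit into what is left of the current sentence?" The two branches below handle the two possible answers.
IF: yes, it fits, i.e. the phrase has no more words than the sentence has left: no split needed!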
if sublist_len <= words1_remaining_len:
In that case, append the phrase to result as a single string (by join ing its words with " ", which undoes the earlier split).
result.append(" ".join(sublist))
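If the phrase used up the sentence exactly, move on to the next sentence and set the offset back to zero. Otherwise, just advance the offset by the number of words consumed.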
if (words1_sublist_len - words1_sublist_offset - sublist_len) == 0:
words1_sublist_id += 1
words1_sublist_offset = 0
else:
words1_sublist_offset += sublist_len
ELSE: no, it does not fit, i.e. the phrase crosses a sentence boundary and has to be split.
else:
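Cut the phrase at the sentence boundary. The cut point is the number of words "remaining" in the sentence (for example, if the phrase has 3 words but only 2 remain in the sentence, the first 2 go to the left part and the last one goes to the right part).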
left = " ".join(sublist[:words1_remaining_len])
right = sublist[words1_remaining_len:]
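(Note that left is a string, not a list, because it is passed through join immediately. right stays a list of words, because it still has to be processed and may be split again.)
Now left is appended to the result list, since it is finished and will not change any more. right is different: it may itself be longer than the next sentence (cf. # 4).
Therefore right is pushed back onto the front of the queue under the same id: i.e. it is handled in the very next iteration, as if it were a fresh phrase.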
result.append(left)
Q.appendleft((sublist_id, right))
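result now holds the pieces of this phrase as separate entries, so remember which phrase was cut in order to group its pieces later.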
splits.append(sublist_id)
Finally, the current words1 sub-list is exhausted. So move on to the next one and reset the offset.
words1_sublist_id += 1
words1_sublist_offset = 0
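Once the queue is empty, every piece is in result, but the pieces of split phrases are still flat. Group them, one recorded split at a time: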
for split in splits:
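If the entry at that position is still a plain string, this is the first cut of that phrase. Replace it, together with the piece that follows it, by a single list holding both. (The slice runs to split+2 rather than split+1, because the end of a slice is exclusive.)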
if isinstance(result[split], str):
result[split:split+2] = [[result[split], result[split + 1]]]
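Otherwise the entry is already a list, which means that the phrase was cut more than once (i.e. it spans three or more sentences, cf. # 4).
In that case the next piece is appended to that list, and the now redundant entry result[split+1] is removed with del.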
else:
result[split] = result[split] + [result[split + 1]]
del result[split + 1]
And that is it, return the result!
return result