Can someone explain the splitting of the list items based on a comparison with another list?

So to speak, I have two lists that contain the same string, but are separated in different ways:

sentences = ["This is a sentence", "so is this"]
phrases = ["This is", "a sentence so", "is this"]

What I'm trying to do is check to see if any element of the "phrase" list is not completely represented by one of the elements in the sentences, and then appropriately break this element of the "phrase". For example, in this case:

"a sentence so"

in “phrases” is partially represented in both elements 1 and 2 in “sentences” and therefore must be divided between “sentence” and “so” in order to create a new element.

“This” and “this” in “phrases” should be effectively ignored, since each of them fully corresponds to one element in “sentences”. After that, suppose that I wanted to make a counter of elements to determine how many are in each list, the result for “sentences” should be 2, but “phrases” should go from 3 to 4.

Sentencecount=0
Phrasecount=0
for i in sentences:
    Sentencecount+=1
for n in phrases:
#code here should check each element with 'sentences' elements and split    accordingly
    Phrasecount += 1

#expected result: phrases = ["This is", "a sentence", "so", "is this"]
+4
source share
3 answers

Well, it was harder - and more fun! - than I expected.

from collections import deque

def align_wordlists(words1, words2):
    # Split every element of the word lists
    # >>> [e.split(" ") for e in ["This is", "a sentence"]]
    # [["This", "is"], ["a", "sentence"]]
    words1_split = [e.split(" ") for e in words1]
    words2_split = [e.split(" ") for e in words2]

    # Assert that the flattened lists are identical
    assert [word for split in words1_split for word in split] == \
            [word for split in words2_split for word in split]

    # Create a queue and two tracking lists
    Q = deque(enumerate(words2_split))
    result = []
    splits = []

    # Keep track of the current sublist in words1
    words1_sublist_id = 0
    words1_sublist_offset = 0

    # Keep iterating until the queue is empty
    while Q:
        sublist_id, sublist = Q.popleft()
        sublist_len = len(sublist)

        words1_sublist_len = len(words1_split[words1_sublist_id])
        words1_remaining_len = words1_sublist_len - words1_sublist_offset

        if sublist_len <= words1_remaining_len:
            # The sublist fits entirely into the current segment in words 1,
            # add sublist untouched to resulting list.
            result.append(" ".join(sublist))

            # Update the sublist tracking
            if (words1_sublist_len - words1_sublist_offset - sublist_len) == 0:
                # The sublist filled the remaining space
                words1_sublist_id += 1
                words1_sublist_offset = 0
            else:
                # The sublist only filled part of the remaining space
                words1_sublist_offset += sublist_len

        else:
            # Only part of the current sublist fits.
            # Split the segment at the point where the left
            # part fits into the current segment of words1.
            # Then add the remaining right list to the front
            # of the queue.
            left = " ".join(sublist[:words1_remaining_len])
            right = sublist[words1_remaining_len:]
            result.append(left)
            Q.appendleft((sublist_id, right))

            # Keep track of splits
            splits.append(sublist_id)

            # update indices
            words1_sublist_id += 1
            words1_sublist_offset = 0

    # Combine splits into sublists to get desired result
    for split in splits:
        if isinstance(result[split], str):
            result[split:split+2] = [[result[split], result[split + 1]]]
        else:
            result[split] = result[split] + [result[split + 1]]
            del result[split + 1]

    return result

Examples

>>> words1 = ["This is a sentence", "so is this"]
>>> words2 = ["This is", "a sentence so", "is this"]
>>> align_wordlists(words1, words2)
['This is', ['a sentence', 'so'], 'is this']

>>> words1 = ["This is a longer", "sentence with", "different splits"]
>>> words2 = ["This is", "a longer sentence", "with different splits"]
>>> align_wordlists(words1, words2)
['This is', ['a longer', 'sentence'], ['with', 'different splits']]

>>> words1 = ["This is a longer", "sentence with", "different splits"]
>>> words2 = ["This is", "a longer sentence with different splits"]
>>> align_wordlists(words1, words2)
['This is', ['a longer', 'sentence with', 'different splits']]

Algorithm Overview

High level description of the algorithm used here. The problem you described boils down to the following question:

For each phrase in the second list of words, to which sentence in the first list does it belong?

, :

  • words1 words2 . , .

    def align_wordlists(words1, words2):
        # Split every element of the word lists
        # >>> [e.split(" ") for e in ["This is", "a sentence"]]
        # [["This", "is"], ["a", "sentence"]]
        words1_split = [e.split(" ") for e in words1]
        words2_split = [e.split(" ") for e in words2]
    
  • , , , , (.. ) , :

        # Assert that the flattened lists are identical
        assert [word for split in words1_split for word in split] == \
                [word for split in words2_split for word in split]
    
  • , , deque, , Python collections.

        # Create a queue and two tracking lists
        Q = deque(enumerate(words2_split))
        result = []
        splits = []
    

    , . . enumerate.

  • , - , .

        # Keep track of the current sublist in words1
        words1_sublist_id = 0
        words1_sublist_offset = 0
    
  • " ", , :

        # Keep iterating until the queue is empty
        while Q:
    
  • : . , 3 . sublist_id - , - , sublist , . , , .

            sublist_id, sublist = Q.popleft()
            sublist_len = len(sublist)
    
  • , , . ( words1_sublist_id 0, .)

            words1_sublist_len = len(words1_split[words1_sublist_id])
            words1_remaining_len = words1_sublist_len - words1_sublist_offset
    

    : " ?" , ​​.

  • IF: , ..: !

            if sublist_len <= words1_remaining_len:
    
    • , result ( join ing " ", . )

              # The sublist fits entirely into the current segment in words 1,
              # add sublist untouched to resulting list.
              result.append(" ".join(sublist))
      
    • , , , . , .

              # Update the sublist tracking
              if (words1_sublist_len - words1_sublist_offset - sublist_len) == 0:
                  # The sublist filled the remaining space
                  words1_sublist_id += 1
                  words1_sublist_offset = 0
              else:
                  # The sublist only filled part of the remaining space
                  words1_sublist_offset += sublist_len
      
  • ELSE: , .. ​​.

            else:
    
    • , . " " (, 3 , 2 , ).

              # Only part of the current sublist fits.
              # Split the segment at the point where the left
              # part fits into the current segment of words1.
              # Then add the remaining right list to the front
              # of the queue.
              left = " ".join(sublist[:words1_remaining_len])
              right = sublist[words1_remaining_len:]
      

      ( left "", join . right , , . )

    • , left result -list, , ​​ . right: , (. # 4).

      , right, : .. , .

              result.append(left)
              Q.appendleft((sublist_id, right))
      
    • result , , .

              # Keep track of splits
              splits.append(sublist_id)
      
    • , words1 -list. , , reset .

              # update indices
              words1_sublist_id += 1
              words1_sublist_offset = 0
      
  • , , . :

        # Combine splits into sublists to get desired result
        for split in splits:
    
    • , , , , . , , . ( split+2 split+1, .)

          if isinstance(result[split], str):
              result[split:split+2] = [[result[split], result[split + 1]]]
      
    • , , , , (.. , . # 4).

      result[split+1] del.

          else:
              result[split] = result[split] + [result[split + 1]]
              del result[split + 1]
      
  • , !

        return result
    
+4

, , , . , id - sentences. , .

, . . , - , . , - - .

, . , , , , .

-. ( ), . , .

sentences = ["This is a sentence", "so is this"]
phrases = ["This is", "a sentence so", "is this"]

sent_words = [ (id,w) for id,sent in enumerate(sentences) for w in sent.lower().split()]
sw = iter(sent_words)


new_phrases = []
for phrase in phrases:
    last_sid = None
    new_phrase = []
    words = []
    for p_id,p_word in enumerate(phrase.lower().split()):
        s_id,s_word = next(sw)
        assert p_word == s_word, "Text differs in sentence {}, phrase {}: '{}' vs. '{}'".format(s_id, p_id, s_word, p_word)

        #print("[{}] {} : [{}] {}".format(s_id, s_word, p_id, p_word))
        if last_sid is None:
            last_sid = s_id
            words.append(p_word)
        elif s_id != last_sid:
            new_phrase.append(' '.join(words))
            words = [p_word]
        else:
            words.append(p_word)
    else:
        if words:
            new_phrase.append(' '.join(words))

    if len(new_phrase) == 1:
        new_phrases.extend(new_phrase)
    else:
        new_phrases.append(new_phrase)

print(new_phrases)

['this is', ['a sentence', 'so'], 'is this']
0

, , , .
enumerate sentences phrases, zip . tuple :

def enumerate_x(d):
    return ((i, w) for i, f in enumerate(d) for w in f.split())

def align(sentences, phrases):
    r = {}
    for (i1, w1), (i2, w2) in zip(enumerate_x(sentences), enumerate_x(phrases)):
        assert w1 == w2, f'{w1} != {w2}'  # Py <3.6: '{} != {}'.format(w1, w2)
        r.setdefault((i1, i2), []).append(w1)
    return [' '.join(r[k]) for k in sorted(r)]

>>> sentences = ["This is a sentence", "so is this"]
>>> phrases = ["This is", "a sentence so", "is this"]
>>> align(sentences, phrases)
['This is', 'a sentence', 'so', 'is this']

This works with the other splits mentioned, for example:

>>> words1 = ["This is a longer", "sentence with", "different splits"]
>>> words2 = ["This is", "a longer sentence", "with different splits"]
>>> align(words1, words2)
['This is', 'a longer', 'sentence', 'with', 'different splits']
0
source

Source: https://habr.com/ru/post/1672571/


All Articles