Rearranging the substring counts the update after replacing it with an "X"

Given the line:

s = 'cdababef'

We compute the before and character after:

def per_window(sequence, n=1):
    """
    From http://stackoverflow.com/q/42220614/610569
        >>> list(per_window([1,2,3,4], n=2))
        [(1, 2), (2, 3), (3, 4)]
        >>> list(per_window([1,2,3,4], n=3))
        [(1, 2, 3), (2, 3, 4)]
    """
    start, stop = 0, n
    seq = list(sequence)
    while stop <= len(seq):
        yield tuple(seq[start:stop])
        start += 1
        stop += 1

char_before= defaultdict(Counter)
char_after = defaultdict(Counter) 
for window in per_window(s, 3):
    char_after[window[:2]][window[2]] += 1
    char_before[window[1:]][window[0]] += 1

[output]:

>>> char_after
defaultdict(collections.Counter,
            {('a', 'b'): Counter({'a': 1, 'e': 1}),
             ('b', 'a'): Counter({'b': 1}),
             ('b', 'e'): Counter({'f': 1}),
             ('c', 'd'): Counter({'a': 1}),
             ('d', 'a'): Counter({'b': 1})})

>>> char_before
defaultdict(collections.Counter,
            {('a', 'b'): Counter({'b': 1, 'd': 1}),
             ('b', 'a'): Counter({'a': 1}),
             ('b', 'e'): Counter({'a': 1}),
             ('d', 'a'): Counter({'c': 1}),
             ('e', 'f'): Counter({'b': 1})})

Say, if I replaced all the instances abwith x, and I need to update the counts of char_afterand char_before, and the goal is to achieve without recounting all substringss = 'cdxxef' , for example:

s = 'cdxxef'
char_before2 = defaultdict(Counter)
char_after2 = defaultdict(Counter) 
for window in per_window(s, 3):
    char_after2[window[:2]][window[2]] += 1
    char_before2[window[1:]][window[0]] += 1

[desired outputs]:

>>> char_before2
defaultdict(collections.Counter,
            {('d', 'x'): Counter({'c': 1}),
             ('e', 'f'): Counter({'x': 1}),
             ('x', 'e'): Counter({'x': 1}),
             ('x', 'x'): Counter({'d': 1})})

>>> char_after2
defaultdict(collections.Counter,
            {('c', 'd'): Counter({'x': 1}),
             ('d', 'x'): Counter({'x': 1}),
             ('x', 'e'): Counter({'f': 1}),
             ('x', 'x'): Counter({'e': 1})})

How can a substring be updated without recounting the entire substring, but only substrings affected by substitutions?


I tried:

s = 'cdababef'

char_before= defaultdict(Counter)
char_after = defaultdict(Counter) 
for window in per_window(s, 3):
    char_after[window[:2]][window[2]] += 1
    char_before[window[1:]][window[0]] += 1

source, target = ('a', 'b'), 'x'
for ch in char_before[source]:
    count_before = char_before[source][ch]
    char_before[target][ch] += count_before
    char_before[source][ch] = 0

    count_after = char_after[source][ch]
    char_after[target][ch] += count_after
    char_before[source][ch] = 0

But the conclusion is not desirable, as with char_before2and char_after2:

>>> char_before
defaultdict(collections.Counter,
            {'x': Counter({'b': 1, 'd': 1}),
             ('b', 'a'): Counter({'a': 1}),
             ('d', 'a'): Counter({'c': 1}),
             ('b', 'e'): Counter({'a': 1}),
             ('a', 'b'): Counter({'b': 0, 'd': 0}),
             ('e', 'f'): Counter({'b': 1})})

>>> char_after
defaultdict(collections.Counter,
            {'x': Counter({'b': 0, 'd': 0}),
             ('b', 'a'): Counter({'b': 1}),
             ('d', 'a'): Counter({'b': 1}),
             ('b', 'e'): Counter({'f': 1}),
             ('a', 'b'): Counter({'a': 1, 'e': 1}),
             ('c', 'd'): Counter({'a': 1})})

+4
source share
3 answers

, :

  • ,
  • char_before char_after,
  • char_before char_after

.

source, target = ('a', 'b'), 'x'
n = 3

char_before= defaultdict(Counter)
char_after = defaultdict(Counter) 
for window in per_window(s, n):
    char_after[window[:2]][window[2]] += 1
    char_before[window[1:]][window[0]] += 1

( ) , ( , - )

import re 

spans = [m.span() for m in re.finditer(''.join(source), s)]

, , , , . , , . , s = 'cdababef', 'ab' 'x', 'cd' char_after , 'cd' .

, merge_spans, ((2,4) (4,6) (2,6)), , extra ( extra - , ). , , , / .

def merge_spans(spans, extra = 0):
    extra = max(0,extra)
    merged = spans[:]
    if len(merged) == 1:
        return [(max(merged[0][0]-extra, 0), merged[0][-1]+extra)]
    for i in range(1, len(merged)):
        span = merged[i]
        prev = merged[i-1]
        if prev[-1]+extra >= span[0]-extra:
            merged[i] = (max(0,prev[0]-extra), span[-1]+extra)
            merged[i-1] = ()
        elif i == len(merged)-1:
            merged[i] = (max(0,span[0]-extra), span[-1]+extra)
            merged[i-1] = (max(0,prev[0]-extra), prev[-1]+extra)
        else:
            merged[i-1] = (max(0,prev[0]-extra), prev[-1]+extra)
    return list(filter(None, merged))       

, . extra n-1, n-1 .

merged = merge_spans(spans, n-1)   

, . .

for span in merged:
    sub_s = s[span[0]:span[-1]]
    for window in per_window(sub_s, n):
        char_after[window[:2]][window[2]] -= 1
        char_before[window[1:]][window[0]] -= 1
    new_s = sub_s.replace(''.join(source), target)
    for window in per_window(new_s, n):
        char_after[window[:2]][window[2]] += 1
        char_before[window[1:]][window[0]] += 1 

, char_before char_after, , - .

, , 0 , , . , Counter() , .

char_before2 = {k:v+Counter() for k,v in char_before.items() if any(v.values())}
char_after2 = {k:v+Counter() for k,v in char_after.items() if any(v.values())}

:

>>> char_before2
{('d', 'x'): Counter({'c': 1}),
 ('e', 'f'): Counter({'x': 1}),
 ('x', 'e'): Counter({'x': 1}),
 ('x', 'x'): Counter({'d': 1})}

>>> char_after2 
{('c', 'd'): Counter({'x': 1}),
 ('d', 'x'): Counter({'x': 1}),
 ('x', 'e'): Counter({'f': 1}),
 ('x', 'x'): Counter({'e': 1})}
+5

, :

, , . , , .

. , ('ab' s). 'x', ('x', 'x'). , , , , , ('d', 'x').

: , s='cdababaef', char_after[('a','b')]['a']=2, char_after[('x','x')]['a']=1.

s='cdabaabaef' char_after[('a','b')]['a']=2, char_after[('x','x')]['a']=2.

, , : , , source ('ab' ) ? s ( char_before char_after s, , , .)

, . , , , . , , , SE.

, , - , .

+3

, - , . , , . , ( ) . , , / , , . , , , .

bunji, , / . , :

from collections import defaultdict
from collections import Counter
import re
from copy import deepcopy

def chars_before_after(s, bin_size):
    def per_window(sequence, n=1):
        """
        From http://stackoverflow.com/q/42220614/610569
            >>> list(per_window([1,2,3,4], n=2))
            [(1, 2), (2, 3), (3, 4)]
            >>> list(per_window([1,2,3,4], n=3))
            [(1, 2, 3), (2, 3, 4)]
        """
        start, stop = 0, n
        seq = list(sequence)
        while stop <= len(seq):
            yield tuple(seq[start:stop])
            start += 1
            stop += 1

    char_before= defaultdict(Counter)
    char_after = defaultdict(Counter)
    for window in per_window(s, bin_size+1):
        char_after[window[:bin_size]][window[-1]] += 1
        char_before[window[1:]][window[0]] += 1

    return char_before, char_after

def replace_chars_recount(s, source, target, char_before, char_after, verbose=False):

    if verbose:
        print('s=' + s + ', source=' + source, 'target=' + target)

        print('char_before')
        for char_counter in char_before.items():
            print(char_counter)

        print('\nchar_after')
        for char_counter in char_after.items():
            print(char_counter)

    char_before = deepcopy(char_before)
    char_after = deepcopy(char_after)
    replaced_s = ''
    source_len = len(source)
    source_start = 0
    source_stop = source_len
    target_pos = []
    target_len = len(target)
    last_replacement = 0

    while source_start < len(s):
        if verbose: print('start_index=' + str(source_start))

        if s[source_start:source_stop] == source:
            replaced_s += target

            before_start = max(source_start-source_len+last_replacement,0)
            before_end = before_start+source_len
            while before_start < source_stop and before_end < len(s):
                before_chars = tuple(s[before_start:before_end])
                if verbose: print('Removing "'+ s[before_end] +'" from after "' + s[before_start:before_end] + '".')
                char_after[before_chars][s[before_end]] -= 1
                before_start += 1
                before_end += 1

            after_end = min(len(s), source_stop+source_len)
            after_start = after_end-source_len
            while after_end > source_start+last_replacement and after_start>0:
                after_chars = tuple(s[after_start:after_end])
                if verbose: print('Removing "' + s[after_start-1] + '" from before "' + s[after_start:after_end] + '".')
                char_before[after_chars][s[after_start-1]] -= 1
                after_start -= 1
                after_end -= 1

            target_pos.append(len(replaced_s) - target_len)
            source_start += source_len
            source_stop += source_len
            last_replacement = source_len

        else:
            replaced_s += s[source_start]
            source_start += 1
            source_stop += 1
            last_replacement = max(0, last_replacement-1)

    last_target = 0-target_len
    for target in target_pos:
        if verbose: print('target_pos=' + str(target))
        before_target_start = max(target-source_len, last_target+target_len, 0)
        before_target_end = before_target_start+source_len
        while before_target_start <= target+target_len-1 and before_target_end < len(replaced_s):
            before_chars = tuple(replaced_s[before_target_start:before_target_end])
            if verbose: print('Adding "' + replaced_s[before_target_end] + '" to after "' + replaced_s[before_target_start:before_target_end] + '".')
            char_after[before_chars][replaced_s[before_target_end]] += 1
            before_target_start += 1
            before_target_end += 1

        after_end = min(len(replaced_s), target + target_len+ source_len)
        after_start = after_end - source_len
        while after_end > max(target, last_target+source_len+target_len) and after_start>0:
            after_chars = tuple(replaced_s[after_start:after_end])
            if verbose: print('Adding "' + replaced_s[after_start - 1] + '" to before "' + replaced_s[after_start:after_end] + '".')
            char_before[after_chars][replaced_s[after_start - 1]] += 1
            after_start -= 1
            after_end -= 1

        last_target=target

    char_before = {k:v+Counter() for k,v in char_before.items() if any(v.values())}
    char_after = {k:v+Counter() for k,v in char_after.items() if any(v.values())}

    if verbose:
        print('replaced_s=' + replaced_s)
        print('char_before')
        for char_counter in char_before.items():
            print(char_counter)

        print('\nchar_after')
        for char_counter in char_after.items():
            print(char_counter)

    return replaced_s, char_before, char_after


def test_replace_chars_recount(s, source, target, verbose=False):
    char_before, char_after = chars_before_after(s, len(source))
    replaced_s, char_before, char_after = replace_chars_recount(s, source, target, char_before, char_after, verbose)
    correct_replaced = re.sub(source, target, s)
    correct_before, correct_after = chars_before_after(replaced_s, len(source))
    correct_answer = correct_replaced==replaced_s and correct_before==char_before and correct_after==char_after

    print('{:>20} {:<20} {:<10} {:<10} {:<5}'.format(s, replaced_s, source, target, str(correct_answer)))

test_cases = [{'s': 'cdababef', 'source': 'ab', 'target': 'x'},
              {'s': 'cdabqabef', 'source': 'ab', 'target': 'x'},
              {'s': 'cdabgabgef', 'source': 'abg', 'target': 'x'},
              {'s': 'cdabgqabgef', 'source': 'abg', 'target': 'x'},
              {'s': 'cdababef', 'source': 'ab', 'target': 'xy'},
              {'s': 'cdababef', 'source': 'ab', 'target': 'xyz'},
              {'s': 'cdababef', 'source': 'a', 'target': 'x'},
              {'s': 'cdababef', 'source': 'a', 'target': 'xyz'},
              {'s': 'ababef', 'source': 'ab', 'target': 'x'},
              {'s': 'cdabab', 'source': 'ab', 'target': 'x'},
              {'s': 'cdababef', 'source': 'xy', 'target': 'x'},
              {'s': 'cdababef', 'source': 'ab', 'target': ''},
              {'s': 'cdabcdabcdef', 'source': 'abcd', 'target': 'x'},
              {'s': 'cdabcdeabcdeabcdeef', 'source': 'abcde', 'target': 'x'},
              {'s': 'cdababef', 'source': 'a', 'target': 'abcd'},
              {'s': 'aaaaa', 'source': 'a', 'target': 'x'},
              {'s': 'aaaaa', 'source': 'a', 'target': 'xy'},
              {'s': '', 'source': '', 'target': ''}]

print('{:>20} {:<20} {:<10} {:<10} {:<5}'.format('Input String', 'Output String', 'Source', 'Target', 'Correct Result?'))

for test_case in test_cases:
    test_replace_chars_recount(test_case['s'], test_case['source'], test_case['target'])

:

    Input String Output String        Source     Target     Correct Result?
            cdababef cdxxef               ab         x          True 
           cdabqabef cdxqxef              ab         x          True 
          cdabgabgef cdxxef               abg        x          True 
         cdabgqabgef cdxqxef              abg        x          True 
            cdababef cdxyxyef             ab         xy         True 
            cdababef cdxyzxyzef           ab         xyz        True 
            cdababef cdxbxbef             a          x          True 
            cdababef cdxyzbxyzbef         a          xyz        True 
              ababef xxef                 ab         x          True 
              cdabab cdxx                 ab         x          True 
            cdababef cdababef             xy         x          True 
            cdababef cdef                 ab                    True 
        cdabcdabcdef cdxxef               abcd       x          True 
 cdabcdeabcdeabcdeef cdxxxef              abcde      x          True 
            cdababef cdabcdbabcdbef       a          abcd       True 
               aaaaa xxxxx                a          x          True 
               aaaaa xyxyxyxyxy           a          xy         True 
                                                                True 

Thus, this approach works regardless of the length of the source / target. The only limitation in the current implementation is that the length of the source must be the same as the size of the hopper for counting the characters before and after. However, you can change this to make it more flexible.

+2
source

Source: https://habr.com/ru/post/1675877/


All Articles