Python: replacing multiple strings at once with support for backreferences

There are several useful ways to perform multiple string replacements at once in Python. However, I have been unable to write an efficient function that does this while also supporting backreferences.

I would like to use a dictionary of expression / replacement terms, where the replacement terms may contain backreferences to something matched by the expression.

E.g. (note the \1):

# The replacement must be a raw string: in a normal literal '\1' is the
# control character chr(1), not the two-character backreference \1.
repdict = {'&&': 'and', '||': 'or', '!([a-zA-Z_])': r'not \1'}

I put the SO answer mentioned at the beginning into the function below, which works well for expression / replacement pairs that don't contain backreferences:

def replaceAll(repdict, text):
    """Replace every key of ``repdict`` found in ``text`` in a single pass.

    Keys are treated as literal strings: each one is escaped before the keys
    are joined into one alternation, so regex syntax inside a key (capture
    groups, character classes, ...) is matched verbatim rather than
    interpreted. Replacement values are inserted as-is.
    """
    escaped = {re.escape(key): value for key, value in repdict.items()}
    alternation = re.compile("|".join(escaped))

    def lookup(match):
        # Re-escape the matched text to recover the key it was stored under.
        return escaped[re.escape(match.group(0))]

    return alternation.sub(lookup, text)

However, it does not work for a key whose replacement contains a backreference.

>>> replaceAll(repdict, "!newData.exists() || newData.val().length == 1")
'!newData.exists() or newData.val().length == 1'

If I do it manually, it works fine, e.g.:

pattern = re.compile("!([a-zA-Z_])")
pattern.sub(r'not \1', '!newData.exists()')

Works as expected:

'not newData.exists()'

, , , backref, -.

. , backrefs , replacer:

def replaceAll(repPat, text):
    """Apply all patterns in ``repPat`` to ``text`` in one pass.

    The keys of ``repPat`` are joined into a single regex alternation. A
    match starting with ``!`` is handled by hand (rewritten to ``not ``
    followed by the rest of the match); any other match is looked up in
    ``repPat`` after being re-escaped with ``naive_escaper``.
    """
    combined = re.compile("|".join(repPat.keys()))

    def replacer(obj):
        matched = obj.group(0)
        if matched[:1] != "!":
            # Naively turn the matched text back into the form used for
            # the dictionary key, then fetch its replacement.
            return repPat[naive_escaper(matched)]
        # Exclamation-mark matches are dealt with manually instead of
        # through a backreference.
        return 'not ' + matched[1:]

    return combined.sub(replacer, text)

def naive_escaper(string):
    """Return ``string`` with ``=`` or ``|`` characters backslash-escaped.

    Only one kind of character is escaped per call: if the string contains
    ``=`` every ``=`` is escaped and ``|`` is left alone; otherwise every
    ``|`` is escaped. Strings containing neither are returned unchanged.
    """
    # Raw strings are used for the replacements: '\=' and '\|' are invalid
    # escape sequences in normal literals and trigger a warning on current
    # Python versions.
    if '=' in string:
        return string.replace('=', r'\=')
    if '|' in string:
        return string.replace('|', r'\|')
    return string

# manually escaping | and = works fine
# (raw strings keep the backslashes without invalid-escape warnings)
repPat = {'!([a-zA-Z_])': '', '&&': 'and', r'\|\|': 'or', r'\=\=\=': '=='}
replaceAll(repPat, "(!this && !that) || !this && foo === bar")

Output:

'(not this and not that) or not this and foo == bar'

, - , , , .

+4
3

: . .


, dict .

. a repldict :

repldict = {r'(a)': r'\1a', r'(b)': r'\1b'}

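If we simply join the keys into the single pattern (a)|(b), the group (b) is no longer group 1, so the backreference in its replacement text would refer to the wrong group.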

, , . b, , \1b ? ; .

, dict :

(?P<group1>(a))|(?P<group2>(b))

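Now we can tell which key matched and resolve its backreferences relative to that key's named group, so that \1b is understood as "the first group after group2".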


:

def replaceAll(repldict, text):
    """Apply several regex replacements to ``text`` in a single pass.

    ``repldict`` maps regex patterns to replacement texts. A replacement may
    contain numeric backreferences (``\\1``, ``\\2``, ...) which refer to the
    capture groups of *its own* pattern; ``\\0`` refers to the whole text
    matched by that pattern.

    Caveats: keys must not contain backreferences of their own or groups
    named ``_<number>``, because all keys are merged into one big pattern.

    Raises:
        IndexError: if a replacement references a group its key does not have.
    """
    if not repldict:
        # Nothing to replace; also avoids unpacking an empty zip() below.
        return text

    # split the dict into two sequences because we need matching positions
    keys, repls = zip(*repldict.items())

    # Put each key in a named group "_<idx>" so that we can find out which
    # one of them matched; <idx> is the position of the key (and of its
    # replacement text) in the sequences above.
    pattern = re.compile(
        '|'.join('(?P<_{}>{})'.format(i, k) for i, k in enumerate(keys))
    )

    # Absolute group number of every wrapper group. This cannot be derived
    # from the key position alone, because earlier keys may contain capture
    # groups of their own which shift the numbering.
    bases = [pattern.groupindex['_{}'.format(i)] for i in range(len(keys))]
    # Number of the last group that belongs to each key.
    ends = [base - 1 for base in bases[1:]] + [pattern.groups]

    def repl(match):
        # The wrapper group of the alternative that matched is always the
        # last group to close, so ``lastgroup`` names it.
        key_index = int(match.lastgroup[1:])
        base = bases[key_index]
        last = ends[key_index]

        def repl_backreference(m):
            # Translate the key-relative reference to an absolute group.
            target = base + int(m.group(1))
            if target > last:
                raise IndexError('no such group')
            # A group that did not participate in the match yields ''.
            return match.group(target) or ''

        return re.sub(r'\\(\d+)', repl_backreference, repls[key_index])

    return pattern.sub(repl, text)

Usage:

# Example 1: the only key with a capture group comes last.
repldict = {'&&':'and', r'\|\|':'or', r'!([a-zA-Z_])':r'not \1'}
print( replaceAll(repldict, "!newData.exists() || newData.val().length == 1") )

# Example 2: the key with a capture group comes first, followed by plain keys.
repldict = {'!([a-zA-Z_])':r'not \1', '&&':'and', r'\|\|':'or', r'\=\=\=':'=='}
print( replaceAll(repldict, "(!this && !that) || !this && foo === bar") )

# output: not newData.exists() or newData.val().length == 1
#         (not this and not that) or not this and foo == bar

:

  • ; .
  • , {r'(a)': r'\2'}. ( , .)
+3

Rawing, , . , .

, , . , replacer ( match.expand).

import re
from collections import OrderedDict
from functools import partial

# Maps each regex pattern to its replacement template; a backreference such
# as \1 is written relative to the pattern it belongs to.
pattern_to_replacement = {'&&': 'and', '!([a-zA-Z_]+)': r'not \1'}


def build_replacer(cases):
    """Build a function that applies all ``cases`` to a string in one pass.

    ``cases`` maps regex patterns to replacement templates. Backreferences
    in a template are relative to its own pattern; they are rewritten to
    absolute group numbers of the combined pattern via
    ``absolute_backreference``.

    Returns a callable equivalent to ``compiled_pattern.sub`` with the
    replacement already bound, i.e. it is called as ``replacer(text)``.
    An empty ``cases`` mapping yields a callable that returns its input
    unchanged.
    """
    replacements = {}
    wrapped = []

    # Absolute number of the wrapper group the next pattern will receive.
    next_group = 1
    for pattern, replacement in cases.items():
        # Backreferences in the template are relative to this wrapper group.
        replacements[next_group] = absolute_backreference(replacement, next_group)
        wrapped.append("({})".format(pattern))

        # Skip the wrapper itself plus every subgroup the pattern contains
        # (determined by compiling the pattern on its own).
        next_group += 1 + re.compile(pattern).groups

    # "(?!)" can never match; it stands in for an empty set of cases so the
    # combined pattern does not degenerate into one matching empty strings.
    combined = re.compile("|".join(wrapped) or r"(?!)")

    def replacer(match):
        # The wrapper group of the alternative that matched closes last, so
        # ``lastindex`` identifies which replacement template to expand.
        return match.expand(replacements[match.lastindex])

    return partial(combined.sub, replacer)


def absolute_backreference(text, n):
    """Shift every numeric backreference in the template ``text`` by ``n``.

    Both ``\\N`` and ``\\g<N>`` references are rewritten, with any number of
    digits. Other escape sequences, including an escaped backslash followed
    by digits, are left untouched.
    """
    # Consume one escape sequence at a time so that the second backslash of
    # an escaped backslash is never mistaken for the start of a reference.
    ref_pat = re.compile(r"\\(?:(\d+)|g<(\d+)>|(.))", re.DOTALL)

    def replacer(match):
        plain, braced = match.group(1), match.group(2)
        if plain is not None:
            return "\\{}".format(int(plain) + n)
        if braced is not None:
            return "\\g<{}>".format(int(braced) + n)
        # Not a numeric reference: keep the escape sequence as written.
        return match.group(0)

    return ref_pat.sub(replacer, text)


# Build the combined replacer once, then apply it to a sample string.
replacer = build_replacer(pattern_to_replacement)
print(replacer("!this.exists()"))
+2

Simple is better than complex; the code shown below is more readable (the reason your code doesn't work as expected is that ([a-zA-Z_]) should not be passed through re.escape):

# Only the literal operator text is escaped; the surrounding \s* and the
# capture group are real regex syntax and must stay unescaped.
repdict = {
    r'\s*' + re.escape('&&') + r'\s*': ' and ',
    r'\s*' + re.escape('||') + r'\s*': ' or ',
    re.escape('!') + r'([a-zA-Z_])': r'not \1',
}
def replaceAll(repdict, text):
    """Apply each pattern/replacement pair of ``repdict`` to ``text`` in turn.

    The pairs are applied one after another in the dictionary's iteration
    order, each on the result of the previous substitution. Replacement
    values are regular ``re.sub`` templates, so backreferences work.
    """
    result = text
    for pattern in repdict:
        result = re.compile(pattern).sub(repdict[pattern], result)
    return result
0
source

Source: https://habr.com/ru/post/1681550/


All Articles