How to split a string on a word, unless the word is inside quotation marks, in Python?

I would like to split the following line on the word “and”, unless the word “and” is inside quotation marks

string = "section_category_name = 'computer and equipment expense' and date >= 2015-01-01 and date <= 2015-03-31"

Desired Result

["section_category_name = 'computer and equipment expense'","date >= 2015-01-01","date <= 2015-03-31"]

I can’t find a regex pattern that splits the string correctly, so that 'computer and equipment expense' is not broken apart.

Here is what I tried:

re.split('and',string)

Result

[" section_category_name = 'computer "," equipment expense' ",' date >= 2015-01-01 ',' date <= 2015-03-31']

As you can see, the result broke 'computer and equipment expense' into different elements in the list.

I also tried the following from this question :

r = re.compile('(?! )[^[]+?(?= *\[)'
               '|'
               '\[.+?\]')
r.findall(s)

Result:

[]

I also tried the following from question

result = re.split(r"and+(?=[^()]*(?:\(|$))", string)

Result:

[" section_category_name = 'computer ",
 " equipment expense' ",
 ' date >= 2015-01-01 ',
 ' date <= 2015-03-31']

, , .

,

string = " section_category_name = (computer and equipment expense) and date >= 2015-01-01 and date <= 2015-03-31"
result = re.split(r"and+(?=[^()]*(?:\(|$))", string)

[' section_category_name = (computer and equipment expense) ',
 ' date >= 2015-01-01 ',
 ' date <= 2015-03-31']

, ''

+4
6

re.findall:

((?:(?!\band\b)[^'])*(?:'[^'\\]*(?:\\.[^'\\]*)*'(?:(?!\band\b)[^'])*)*)(?:and|$)

regex.

, a ', and ( (?:(?!\band\b)[^'])*) ( escape-) ( '[^'\\]*(?:\\.[^'\\]*)*' - ([^'\\]|\\.)*).

Python :

import re
# Each match captures one clause in group 1 and then consumes the separator:
#   (?:(?!\band\b)[^'])*        a run of non-quote characters, stopping before
#                               the whole word "and"
#   '[^'\\]*(?:\\.[^'\\]*)*'    a single-quoted literal in which a backslash
#                               escapes the character that follows it
#   (?:and|$)                   the "and" separator, or the end of the input
# The quoted-literal / unquoted-run pair may repeat any number of times, so an
# "and" inside quotes never ends a clause.
p = re.compile(r'((?:(?!\band\b)[^\'])*(?:\'[^\'\\]*(?:\\.[^\'\\]*)*\'(?:(?!\band\b)[^\'])*)*)(?:and|$)')
s = "section_category_name = 'computer and equipment expense' and date >= 2015-01-01 and date <= 2015-03-31"
# findall() yields the group-1 text of every match; empty captures are dropped.
# Nothing here strips whitespace from the captured clauses.
print([x for x in p.findall(s) if x])
0

re.findall 2- , , , - -, .

itertools.groupby "" ( ), -comp, :

import re
from itertools import groupby

text = "section_category_name = 'computer and equipment expense'      and date >= 2015-01-01 and date <= 2015-03-31 and blah = 'ooops'"

# Tokenise into (quoted, word) pairs: a single-quoted literal lands in the
# first slot, any other run of non-whitespace in the second.  The pattern is a
# raw string so that "\S" reaches the regex engine instead of being treated as
# an (invalid) string escape.
tokens = re.findall(r"('.*?')|(\S+)", text)

# groupby() alternates between runs of ordinary tokens and runs of bare "and"
# separators; keep only the former and re-join each run with single spaces.
# A quoted literal containing "and" sits in slot 0, so it never compares equal
# to the separator.
items = [
    ' '.join(quoted or word for quoted, word in run)
    for is_separator, run in groupby(tokens, lambda pair: pair[1] == 'and')
    if not is_separator
]

:

["section_category_name = 'computer and equipment expense'",
 'date >= 2015-01-01',
 'date <= 2015-03-31',
 "blah = 'ooops'"]

, - , ...

- , lambda L: L[1] == 'and' lambda L: L[1] in ('and', 'or') , ..

+1

, , 3 . ". - " ". .

import re

string = "section_category_name = 'computer and equipment expense' and date >= 2015-01-01 and date <= 2015-03-31"

# Group 1 runs from the start of the string through a single-quoted value;
# the two whitespace-delimited "and" separators that follow split off
# groups 2 and 3.  The pattern is tied to exactly three clauses.
clause_pattern = re.compile(r"(^.+['].+['])\sand\s(.+)\sand\s(.+)")
found = clause_pattern.match(string)
output = found.groups()
print(output)

. . , "section_category_name" - .

section_category_name = 'something here' and ...
0

, .

import re

# We create a "lexer" using regex. Its alternatives, tried in order, match:
#   1. a string surrounded by single quotes (kept as one token, so an "and"
#      inside quotes is never seen as a separator),
#   2. a run of characters containing no space,
#   3. the end of the string (an empty match, used below as a flush signal).
# finditer() then yields all non-overlapping tokens from left to right.
lexer = re.compile(r"'[^']*'|[^ ]+|$")

string = "section_category_name = 'computer and equipment expense' and date >= 2015-01-01 and date <= 2015-03-31"

results = []
buff = []  # tokens of the clause currently being assembled

# Iterate through all the tokens our lexer identified and parse accordingly
for match in lexer.finditer(string):
    token = match.group(0)  # group 0 is the entire matching string

    if token in ('and', ''):
        # Once we reach 'and' or the end of the string '' (matched by $)
        # we join all previous tokens with a space and add to our results.
        results.append(' '.join(buff))
        buff = []  # Reset for the next set of tokens
    else:
        buff.append(token)

# print() call form: valid on Python 3 and prints the same list on Python 2.
print(results)

: , for itertools.groupby.

import re
from itertools import groupby

string = "section_category_name = 'computer and equipment expense' and date >= 2015-01-01 and date <= 2015-03-31"

# Tokens are either a complete single-quoted literal or a run of characters
# that are neither whitespace nor a quote.
lexer = re.compile(r"'[^']*'|[^\s']+")

# groupby() splits the token stream into alternating runs of separators
# (tokens equal to 'and') and ordinary tokens; only the latter are kept and
# re-joined with single spaces.
grouping = groupby(lexer.findall(string), lambda x: x == 'and')
results = [' '.join(g) for k, g in grouping if not k]

# print() call form: valid on Python 3 and prints the same list on Python 2.
print(results)

0

, re.split :

, .

None . , .

>>> tokens = re.split(r"('[^']*')|and", string)
# ['section_category_name = ', "'computer and equipment expense'", ' ', None, ' date >= 2015-01-01 ', None, ' date <= 2015-03-31']    
>>> ''.join([t if t else '\0' for t in tokens]).split('\0')
["section_category_name = 'computer and equipment expense' ", ' date >= 2015-01-01 ', ' date <= 2015-03-31']

. 0x00 char , , , .

0

, , and, , and . , 'hello and and bye', 'hello andand bye'?

, "", , :

string = "section_category_name = 'computer and equipment expense' and date >= 2015-01-01 and date <= 2015-03-31"
spl = 'and'
res = []

# Splitting on the quote character alternates between text outside quotes
# (even positions) and text that was inside quotes (odd positions).
for position, chunk in enumerate(string.split("'")):
    if position % 2:
        # Quoted text: restore the quotes and glue it onto the clause in
        # progress, or start a new clause if there is none yet.
        literal = "'" + chunk + "'"
        if not res:
            res.append(literal)
        else:
            res[-1] += literal
        continue

    # Unquoted text: cut on the separator and trim only the sides of each
    # piece that touched a separator.
    pieces = chunk.split(spl)
    if len(pieces) > 1:
        middle = [piece.strip() for piece in pieces[1:-1]]
        pieces = [pieces[0].rstrip()] + middle + [pieces[-1].lstrip()]
    res.extend(piece for piece in pieces if piece.strip())

An even simpler solution, if you know that `and` will always be surrounded by a space on both sides, that it will not be repeated, and you do not need to remove extra spaces:

string = "section_category_name = 'computer and equipment expense' and date >= 2015-01-01 and date <= 2015-03-31"
spl = 'and'
spaced_spl = ' ' + spl + ' '
res = []

# Even positions hold text outside quotes, odd positions text inside quotes.
for position, chunk in enumerate(string.split("'")):
    if position % 2:
        # Quoted text is never split: put the quotes back and attach it to
        # the most recent clause (or start one if the list is still empty).
        literal = "'" + chunk + "'"
        if not res:
            res.append(literal)
        else:
            res[-1] += literal
        continue

    # Unquoted text: cut on the space-padded separator, skip blank pieces.
    res.extend(piece for piece in chunk.split(spaced_spl) if piece.strip())

Output:

["section_category_name = 'computer and equipment expense'", 'date >= 2015-01-01', 'date <= 2015-03-31']
0
source

Source: https://habr.com/ru/post/1621533/


All Articles