Python - delete all words containing other words in a list

I have a list filled with words from a dictionary. I want to find a way to delete every word that begins with another word from the list, so that only the root words remain.

For example, the word "rodeo" will be removed from the list because it begins with the English word "rode". The word "typewriter" will be deleted because it begins with the English word "type". However, the word "snicker" remains valid even though it contains the word "nick", because "nick" is in the middle and not at the beginning of the word.

I thought something like this:

 for line in wordlist:
        if line.find(...) --

but I want the if statement to then check each individual word in the list, see whether it is found at the start of the current word and, if so, remove the current word from the list so that only the root words remain. Should I create a copy of the list of words to iterate over?

+3
source share
7 answers

I assume that you have only one list from which you want to remove any elements with prefixes in the same list.

# Important assumption here: wordlist is sorted, so every word that starts
# with a given root appears directly after that root.
#
# Prints each root word (a word that does not start with the previously
# printed root), one per line.
if wordlist:                              # an empty list has no first word to seed the scan
    base = wordlist[0]                    # consider the first word in the list
    for word in wordlist:                 # loop through the entire list checking if
        if not word.startswith(base):     # the word we're considering starts with the base
            print(base)                   # if not, we have a new base: print the current
            base = word                   # one and move on to the new one
        # else: word starts with base, so don't output it; go on to the next item
    print(base)                           # finish by printing the last base

EDIT: Added some comments to make the logic more obvious.

+5
source

So, you have two lists: a list of words that you want to check and possibly delete, and a list of valid words. If you like, you can use the same list for both purposes, but I assume that you have two lists.

. , - . , . "a" "I" , , "a", , ?

/usr/share/dict/words Ubuntu. ; , , , . , "k" "q", "z" .. , , , , - . , .

Here is the code:

# Build the set of valid words from the system dictionary,
# /usr/share/dict/words (one word per line).
wfile = "/usr/share/dict/words"
with open(wfile) as f:  # the context manager closes the file once the set is built
    # Strip each line first so the length test counts only the word itself,
    # not the trailing newline; words shorter than 2 characters are skipped.
    valid = {word for word in (line.strip() for line in f) if len(word) >= 2}

lst = ["ark", "booze", "kite", "live", "rodeo"]

def subwords(word):
    """Yield every proper, non-empty prefix of *word*, longest first.

    The full word itself and the empty string are never yielded, so a
    word of length 0 or 1 produces nothing.
    """
    end = len(word) - 1
    while end > 0:
        yield word[:end]
        end -= 1

# Keep only the words from lst that have no proper prefix in the valid set.
newlst = []
for candidate in lst:
    # For debugging, inspect the prefixes being tested:
    #   print("subwords", list(subwords(candidate)))
    #   print("valid subwords", [p for p in subwords(candidate) if p in valid])
    has_valid_prefix = False
    for prefix in subwords(candidate):
        if prefix in valid:
            has_valid_prefix = True
            break  # one matching prefix is enough to reject the word
    if has_valid_prefix:
        continue
    newlst.append(candidate)

print(newlst)

, "" :

# A word survives only if none of its proper prefixes is a valid word.
newlst = [
    candidate
    for candidate in lst
    if all(prefix not in valid for prefix in subwords(candidate))
]

, , , , . ​​

, , , :

def keep(word):
    """Return True if no prefix yielded by subwords(word) is in valid."""
    for prefix in subwords(word):
        if prefix in valid:
            return False
    return True

newlst = list(filter(keep, lst))

Python , , .

+6

, jkerian asnwer ( ), , .

( ):

wordlist = ["a", "arc", "arcane", "apple", "car", "carpenter", "cat", "zebra"]


def root_words(wordlist):
    """Return the root words of *wordlist* as a new list.

    A word is dropped when it starts with the most recently kept word
    (the current "base"). This relies on every word that shares a root
    appearing after that root with no unrelated word in between, which
    a sorted list guarantees.

    An empty *wordlist* yields an empty list.
    """
    if not wordlist:
        # Nothing to scan; indexing wordlist[0] below would raise IndexError.
        return []

    result = []
    base = wordlist[0]
    for word in wordlist:
        if not word.startswith(base):
            # word is not derived from the current base, so the base is
            # final; record it and start tracking the new one.
            result.append(base)
            base = word
    result.append(base)  # the last base is never followed by a mismatch
    return result


print(root_words(wordlist))

( , ), . , , , . , "" "" , - "".

+1

lambda. , .

words = ['rode', 'nick'] # this is the list of all the words that you have.
                         # I'm using 'rode' and 'nick' as they're in your example
listOfWordsToTry = ['rodeo', 'snicker']


def validate(w):
    """Return True if *w* does not start with any entry of ``words``.

    A word that is itself in ``words`` starts with that entry, so it is
    rejected as well.
    """
    return not any(w.startswith(word) for word in words)


# validate is passed to filter directly; list() materializes the result so
# it is a real, reusable list rather than a one-shot iterator on Python 3.
wordsThatDontStartWithValidEnglishWords = list(
    filter(validate, listOfWordsToTry)
)

, .

,

+1

, : , , . , , , trie .

, , .

Wikipedia article on tries:

http://en.wikipedia.org/wiki/Trie

Python . , dict, Trie . "" , "" Trie. "" - , Python.

, , ; . .

"cat" "catch" trie, "c" , "a" "t" ( "c" "" ). node "a" "" "t" ( , "cat" ), node "c" , "h" ( "catch" ). , "catch" "cat" node . trie .

def _pad(n):
    return " " * n

class Trie(object):
    """Prefix tree used to test whether a word starts with a stored word.

    Each node keeps two dicts:
      t -- maps a symbol to the child node reached through that symbol
      w -- holds the symbols that END a stored word at this node, i.e. the
           last symbol of a word is recorded on the node of its
           second-to-last symbol rather than on a node of its own
    """

    def __init__(self):
        self.t = {}  # symbol -> sub-trie
        self.w = {}  # terminal symbols at this level (used as a set)

    def add(self, word):
        """Store *word* in the trie; an empty word is ignored."""
        if not word:
            return
        node = self
        stem, last = word[:-1], word[-1]
        # Walk (creating nodes as needed) along everything but the last symbol.
        for symbol in stem:
            child = node.t.get(symbol)
            if child is None:
                child = node.t[symbol] = Trie()
            node = child
        node.w[last] = True  # mark the final symbol as a terminal here

    def prefix_match(self, word):
        """Return True if a stored word is a proper prefix of *word*.

        The last symbol of *word* is deliberately not examined: matching
        it would mean the whole word is stored, which is not a prefix
        match. An empty word never matches.
        """
        if not word:
            return False
        node = self
        for symbol in word[:-1]:
            if symbol in node.w:
                return True  # a stored word ends exactly here
            node = node.t.get(symbol)
            if node is None:
                return False  # no stored word continues along this path
        return False

    def debug_str(self, nest, s=None):
        """Return the trie as an indented, one-node-per-line string.

        *nest* is the depth of this node (0 for the root) and *s* is the
        symbol that led to it from its parent.
        """
        terminals = "".join(self.w)
        if nest == 0:
            lines = [object.__str__(self), "--top--: " + terminals]
        else:
            lines = ["%s%s: %s" % (_pad(nest), s, terminals)]
        lines.extend(
            child.debug_str(nest + 1, symbol)
            for symbol, child in self.t.items()
        )
        return "\n".join(lines)

    def __str__(self):
        return self.debug_str(0)



t = Trie()


# Build the valid-word trie from /usr/share/dict/words, which has every
# letter of the alphabet as words!  Only take 2-letter words and longer.

wfile = "/usr/share/dict/words"
with open(wfile) as f:  # the context manager closes the file after loading
    for line in f:
        word = line.strip()
        if len(word) >= 2:
            t.add(word)

# add valid 1-letter English words
t.add("a")
t.add("I")



lst = ["ark", "booze", "kite", "live", "rodeo"]
# "ark" starts with "a"
# "booze" starts with "boo"
# "kite" starts with "kit"
# "live" is good: "l", "li", "liv" are not words
# "rodeo" starts with "rode"

# Keep only the words that do not begin with a word stored in the trie.
newlst = [w for w in lst if not t.prefix_match(w)]

print(newlst)  # prints: ['live']
+1
0

- , .

, O (n log N) O (M) , M - . .

l = sorted(your_list)
# Walk adjacent pairs of the sorted list and keep a word only when the word
# right after it does not start with it; the final word has no successor
# and is always kept (the slice is empty for an empty list).
removed_prefixes = [
    current
    for current, following in zip(l, l[1:])
    if not following.startswith(current)
] + l[-1:]
  • , N , N + 1.

  • , . .

, :

 # str.startswith accepts a tuple of prefixes and is true if any one matches.
 banned = tuple(banned_prefixes)
 removed_prefixes = [i for i in your_list if not i.startswith(banned)]

This relies on the fact that startswith accepts a tuple. It probably runs in something close to N * M, where N is the number of items in the list and M is the number of items in banned. Perhaps Python could do some clever things to make it a little faster. If, like the OP, you want to ignore case, you will need calls to .lower() in places.

0
source

Source: https://habr.com/ru/post/1787080/


All Articles