Reduce set size while maintaining a minimum frequency

Suppose I have the following set:

{(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)} 

This gives each number the following frequency:

 2: 4, 3: 4, 4: 3, 1: 2 

Can you suggest a way to reduce the set so that each number exists in it at least 2 times, but where the number of tuples in the set is minimized?

For example, tuple (3, 4) can be removed from the set, leaving these frequencies:

 2: 4, 3: 3, 4: 2, 1: 2 

Here is my very weak attempt to solve this:

    from collections import Counter

    def reduce(a, limit):
        while True:
            remove = None
            for t in a:
                c = Counter(e for s in a for e in s)
                if c.most_common(1)[0][0] in t:
                    if min(c[j] for j in t) > limit:
                        remove = t
                        break
            if remove:
                a.remove(remove)
            else:
                break

    reduce(a, 2)  # we want at least two of each number

The problem with this solution is that it may well reduce the set, but not necessarily leave me with the smallest possible set.

In my specific example, the set that I want to reduce contains strings, let's say something like this:

a = [("one","eighty one","three"), ("eighty five","sixty one","three", "eleven"), ...] where the length of a is 1000. The length of each a tuple in a is 3 to 9. There are 100 unique values ​​that can consist of tuples, for example, “one” is one of those values. I want each unique value to be represented at least 25 times after I have reduced the set. How long does it take for a computer to calculate abbreviated dialing? Are we talking a few seconds or minutes?

+6
4 answers

As noted in the comments, the NP-hard Set Cover problem is the special case of this problem where the minimum frequency is k = 1, which makes this problem NP-hard as well. I would recommend a library such as PuLP with the following integer program.

    minimize sum over tuples T of x(T)
    subject to
    y(e): for all elements e, (sum over tuples T of (count of e in T) * x(T)) >= k
    z(T): for all tuples T, x(T) in {0, 1}
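A minimal PuLP sketch of that program, assuming PuLP and its bundled CBC solver are installed (the helper name min_cover_ilp is mine):

    from collections import Counter
    from pulp import LpBinary, LpMinimize, LpProblem, LpVariable, lpSum

    def min_cover_ilp(tuples, k):
        tuples = list(tuples)
        elements = {e for t in tuples for e in t}
        prob = LpProblem("reduce_set", LpMinimize)
        # x[i] = 1 if tuple i is kept in the reduced set, 0 otherwise
        x = [LpVariable("x_%d" % i, cat=LpBinary) for i in range(len(tuples))]
        prob += lpSum(x)  # minimize the number of kept tuples
        for e in elements:
            # each element must still occur at least k times across the kept tuples
            prob += lpSum(Counter(t)[e] * x[i] for i, t in enumerate(tuples)) >= k
        prob.solve()
        return [t for i, t in enumerate(tuples) if x[i].value() > 0.5]

    print(min_cover_ilp({(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)}, 2))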

The only drawback of PuLP is that it requires an external solver. I was in the mood for a hack, however, so I wrote a (very lightly tested) pure Python solver. It uses depth-first search with best-first backtracking, with a simple propagation strategy for deciding which tuples must or must not be chosen, and a heuristic function based on a primal-dual approximation of the following dual of the preceding program (so it's a sophisticated toy, but still a toy).

    maximize (sum over elements e of k * y(e)) - (sum over tuples T of z(T))
    subject to
    x(T): for all tuples T, (sum over elements e in T of y(e)) - z(T) <= 1
    for all elements e, y(e) >= 0
    for all tuples T, z(T) >= 0

The primal-dual strategy is to raise, at the same rate, those y values whose increase does not force an unprofitable corresponding increase in z.

    from collections import Counter, defaultdict, namedtuple
    from fractions import Fraction
    from heapq import heappop, heappush
    from math import ceil
    from operator import itemgetter


    class _BestFirstSearchDepthFirstBacktracking:
        def optimize(self):
            node = self._make_root_node()
            heap = []
            upper_bound = None
            while True:
                lower_bound = ceil(node.lower_bound)
                if upper_bound is None or lower_bound < upper_bound:
                    child_nodes = list(self._make_child_nodes(node))
                    if child_nodes:
                        i, node = min(enumerate(child_nodes), key=itemgetter(1))
                        del child_nodes[i]
                        for child_node in child_nodes:
                            heappush(heap, child_node)
                        continue
                    upper_bound = lower_bound
                    solution = node
                if not heap:
                    return (upper_bound, solution)
                node = heappop(heap)


    Node = namedtuple('Node', ('lower_bound', 'index', 'maybes', 'yeses', 'variable'))


    class UnsolvableException(Exception):
        pass


    class _Optimizer(_BestFirstSearchDepthFirstBacktracking):
        def __init__(self, tuples, min_freq):
            self._index = 0
            self._tuples = set(tuples)
            self._min_freq = min_freq
            self._elements = set()
            for t in self._tuples:
                self._elements.update(t)

        def _propagate(self, maybes, yeses):
            upper_count = Counter()
            for t in maybes:
                upper_count.update(t)
            for t in yeses:
                upper_count.update(t)
            if any(upper_count[e] < self._min_freq for e in self._elements):
                raise UnsolvableException()
            forced_yeses = set()
            forced_yeses = {t for t in maybes
                            if any(upper_count[e] - k < self._min_freq
                                   for e, k in Counter(t).items())}
            maybes = maybes - forced_yeses
            yeses = yeses | forced_yeses
            lower_count = Counter()
            for t in yeses:
                lower_count.update(t)
            residual = {e for e in self._elements if lower_count[e] < self._min_freq}
            maybes = {t for t in maybes if any(e in residual for e in t)}
            return (maybes, yeses)

        def _compute_heuristic(self, maybes, yeses):
            lower_count = Counter()
            for t in yeses:
                lower_count.update(t)
            residual_count = {e: max(self._min_freq - lower_count[e], 0)
                              for e in self._elements}
            y = defaultdict(int)
            z = defaultdict(int)
            variable = None
            while True:
                slack = {t: 1 + z[t] - sum(y[e] for e in t) for t in maybes}
                assert all(s >= 0 for s in slack.values())
                inactive_maybes = {t for t, s in slack.items() if s > 0}
                if not inactive_maybes:
                    break
                active_maybes = {t for t, s in slack.items() if s == 0}
                active_count = Counter()
                for t in active_maybes:
                    active_count.update(t)
                dy = {e: 1 for e, k in residual_count.items() if active_count[e] < k}
                if not dy:
                    break
                delta_inverse, variable = max(
                    ((Fraction(sum(dy.get(e, 0) for e in t), slack[t]), t)
                     for t in inactive_maybes),
                    key=itemgetter(0))
                delta = Fraction(1, delta_inverse)
                for e, dy_e in dy.items():
                    y[e] += delta * dy_e
                for t in active_maybes:
                    z[t] += delta * sum(dy.get(e, 0) for e in t)
            return (sum(residual_count[e] * y_e for e, y_e in y.items()) - sum(z.values()),
                    variable)

        def _make_node(self, maybes, yeses):
            maybes, yeses = self._propagate(maybes, yeses)
            heuristic, variable = self._compute_heuristic(maybes, yeses)
            node = Node(len(yeses) + heuristic, self._index, maybes, yeses, variable)
            self._index += 1
            return node

        def _make_root_node(self):
            return self._make_node(self._tuples, set())

        def _make_child_nodes(self, node):
            if node.variable is None:
                return
            variable = {node.variable}
            maybes = node.maybes - variable
            yield self._make_node(maybes, node.yeses)
            yield self._make_node(maybes, node.yeses | variable)


    def optimize(tuples, min_freq):
        optimizer = _Optimizer(tuples, min_freq)
        node = optimizer.optimize()[1]
        print('Nodes examined:', optimizer._index)
        return node.yeses


    print(optimize({(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)}, 2))
    print(optimize({(1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14),
                    (1, 2, 3, 4, 8, 9, 10, 11), (5, 6, 12, 13), (7, 14)}, 1))
+7

Here is a quick and dirty method. Hopefully it gets you going.

Unfortunately, it is not guaranteed to find the exact smallest result set. It gets rid of smaller tuples first, so if you tend to have more small tuples and fewer long ones, it might work for you.

It also starts off with an ordered set (a list) but never restores the order. It needs to be ordered, at least within the function, so the computed values correlate correctly. I would like to clean it up and refactor, but it's late.

    def reduce(source, min_count=2):
        print "source: {}".format(source)
        # [(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)]
        answer = []
        freq = {}
        lens = []
        for t in source:
            lens.append(len(t))
            for i in t:
                freq[i] = freq.get(i, 0) + 1
        print "freq: {}".format(freq)
        # {1: 2, 2: 4, 3: 4, 4: 3}
        print "lens: {}".format(lens)
        # [1, 1, 2, 3, 2, 2, 2]

        from collections import defaultdict
        slens = defaultdict(list)
        for l, t in zip(lens, source):
            slens[l].append(t)
        print "slens: {}".format(slens)
        # {1: [(2,), (3,)], 2: [(1, 4), (2, 3), (3, 4), (2, 4)], 3: [(1, 2, 3)]}

        for l in sorted(slens.keys()):
            for t in slens[l]:
                save = False
                for i in t:
                    if (freq[i] <= min_count):
                        save = True
                    freq[i] -= 1
                if save:
                    answer.append(t)
        print "answer: {}".format(answer)
        # [(1, 4), (1, 2, 3), (3, 4), (2, 4)]

        freq = {}
        for t in answer:
            for i in t:
                freq[i] = freq.get(i, 0) + 1
        print "freq: {}".format(freq)
        # {1: 2, 2: 2, 3: 2, 4: 3}

My initial thought was to iterate through once, keep any tuples containing elements at or below min_count, and reduce the working set. Then score the remaining tuples, with lower-frequency elements counting for more. Then discard the lowest-scoring tuples that would not push any element's frequency below min_count when removed. Then recompute the frequencies and start over.
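A rough sketch of that scoring idea (my own illustration, not the answer's tested code; greedy_reduce is a made-up name): score each remaining tuple by how rare its elements are, then drop the cheapest tuples that can still be spared.

    from collections import Counter

    def greedy_reduce(source, min_count=2):
        kept = list(source)
        freq = Counter(e for t in kept for e in t)
        # tuples made only of common elements score lowest, so they are tried first
        for t in sorted(kept, key=lambda t: sum(1.0 / freq[e] for e in t)):
            if all(freq[e] - 1 >= min_count for e in t):
                kept.remove(t)
                for e in t:
                    freq[e] -= 1
        return kept

    print(greedy_reduce([(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)]))
    # [(1, 4), (1, 2, 3), (3, 4), (2, 4)] -- every number still appears at least twice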

+2

The problem is at least NP-hard, which means you will not be able to find an efficient (polynomial-time) algorithm. However, there are ways to reduce the constant factors. Besides using better algorithms, consider using a faster runtime such as PyPy.

The following code, if run to completion, will return a subset of the smallest possible size. Additionally, it only considers valid solutions and can progressively print smaller and smaller covering subsets as they are found.

    from collections import defaultdict
    from itertools import product, combinations

    def covering_set(sets, min_freq, print_approximations=False):
        # dictionary mapping each unique value to the sets that contain it
        d = defaultdict(list)
        for set_ in sets:
            for elem in set_:
                d[elem].append(set_)

        # we need min_freq occurrences of each unique value
        combos = [combinations(x, min_freq) for x in d.values()]

        # initial solution
        min_cover = sets
        min_length = len(sets)

        # iterate through valid solutions
        # cover is a list of lists of sets
        for cover in product(*combos):
            # we must flatten and remove the duplicates in the cover
            covering_set = set()
            for elem_cover in cover:
                for set_ in elem_cover:
                    if set_ not in covering_set:
                        covering_set.add(set_)

            # now, check whether it is the smallest solution so far
            if len(covering_set) < min_length:
                min_cover = covering_set
                min_length = len(covering_set)
                if print_approximations:
                    print(min_length, min_cover)

        return min_cover
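A quick usage sketch (my addition, not part of the answer): the tuples are hashable, so the input set can be passed directly, and print_approximations=True prints each improvement as it is found.

    tuples = {(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)}
    print(covering_set(tuples, 2, print_approximations=True))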
+1

So here is my solution. I saw that you have at most 1000 elements, so I decided to implement the algorithm recursively.

First of all, we define a function that computes the frequency of each number across the tuples:

    def get_frequency(tuple_list):
        frequency = {}

        def mapper(element):
            if frequency.has_key(element):
                frequency[element] += 1
            else:
                frequency[element] = 1

        map(lambda t: map(mapper, t), tuple_list)
        return frequency
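As a side note (mine, not the answer's), collections.Counter computes the same frequencies in one expression:

    from collections import Counter

    def get_frequency(tuple_list):
        return Counter(element for t in tuple_list for element in t)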

This is relatively easy to do, so I won't spend much time on it. After that, I decided to implement the core function, called recursive. This function returns a tuple consisting of the list of elements that can be removed and the maximum depth the algorithm can reach.

This is the draft algorithm I wrote before the implementation:

    if tuple_list is null:
        return ([], iteration)

    best_deletion = None
    for elements:
        if element can't be deleted:
            continue
        launch the next recursion without the current element in the list
        if the iteration is better than best_iteration or best_iteration is None:
            set the result of recursion in best_deletion

    if best_deletion is None:
        return ([], iteration)
    return the best_iteration with adding the current Node inside, and increment the iteration

Here is the result:

    def recursive(tuple_list, limit, iteration):
        if tuple_list == []:
            return ([], iteration)

        frequency = get_frequency(tuple_list)
        value = None
        for i in xrange(len(tuple_list)):
            impossible_to_delete = False
            for number in tuple_list[i]:
                frequency[number] -= 1
                if frequency[number] < limit:
                    impossible_to_delete = True
                    break
            if impossible_to_delete:
                continue
            next_recursion_list = tuple_list[:]
            next_recursion_list.pop(i)
            maximum_deletion = recursive(next_recursion_list, limit, iteration + 1)
            if value == None:
                maximum_deletion[0].insert(0, tuple_list[i])
                value = (maximum_deletion[0], maximum_deletion[1] + 1)
            else:
                if value[1] < maximum_deletion[1]:
                    maximum_deletion[0].insert(0, tuple_list[i])
                    value = (maximum_deletion[0], maximum_deletion[1] + 1)
        if value == None:
            return ([], iteration)
        return value

After that, just call this function:

 items_to_delete = recursive(list(tuple_set), 2, 0) 

Hope this helps. If I get some time, I will check which of the algorithms is fastest for your use case.

+1

Source: https://habr.com/ru/post/975173/

