As noted in the comments, the NP-hard Set Cover problem is a special case of this problem, where the minimum frequency is k = 1 , which makes this an NP-hard problem as well. I would recommend a library such as PuLP with the following integer Program.
minimize sum over tuples T of x(T) subject to y(e): for all elements e, (sum over tuples T of (count of e in T) * x(T)) >= k z(T): for all tuples T, x(T) in {0, 1}
The only drawback of PuLP is that it requires an external solver. I was in the mood for a hack, however, so I wrote a (very easily verified) clean Python Solver. It uses a depth search with the best first backtrace, with a simple propagation strategy, to determine which tuples should or should not be selected and a heuristic function based on primary-dual approaching the next dual previous program (so its a complicated toy, but still a toy).
maximize (sum over elements e of k * y(e)) - (sum over tuples T of z(T)) subject to x(T): for all tuples T, (sum over elements e in T of y(e)) - z(T) <= 1 for all elements e, y(e) >= 0 for all tuples T, z(T) >= 0
The initial double strategy is to increase the y values that increase at the same speed, without requiring an unprofitable corresponding increase in z .
from collections import Counter, defaultdict, namedtuple from fractions import Fraction from heapq import heappop, heappush from math import ceil from operator import itemgetter class _BestFirstSearchDepthFirstBacktracking: def optimize(self): node = self._make_root_node() heap = [] upper_bound = None while True: lower_bound = ceil(node.lower_bound) if upper_bound is None or lower_bound < upper_bound: child_nodes = list(self._make_child_nodes(node)) if child_nodes: i, node = min(enumerate(child_nodes), key=itemgetter(1)) del child_nodes[i] for child_node in child_nodes: heappush(heap, child_node) continue upper_bound = lower_bound solution = node if not heap: return (upper_bound, solution) node = heappop(heap) Node = namedtuple('Node', ('lower_bound', 'index', 'maybes', 'yeses', 'variable')) class UnsolvableException(Exception): pass class _Optimizer(_BestFirstSearchDepthFirstBacktracking): def __init__(self, tuples, min_freq): self._index = 0 self._tuples = set(tuples) self._min_freq = min_freq self._elements = set() for t in self._tuples: self._elements.update(t) def _propagate(self, maybes, yeses): upper_count = Counter() for t in maybes: upper_count.update(t) for t in yeses: upper_count.update(t) if any(upper_count[e] < self._min_freq for e in self._elements): raise UnsolvableException() forced_yeses = set() forced_yeses = {t for t in maybes if any(upper_count[e] - k < self._min_freq for e, k in Counter(t).items())} maybes = maybes - forced_yeses yeses = yeses | forced_yeses lower_count = Counter() for t in yeses: lower_count.update(t) residual = {e for e in self._elements if lower_count[e] < self._min_freq} maybes = {t for t in maybes if any(e in residual for e in t)} return (maybes, yeses) def _compute_heuristic(self, maybes, yeses): lower_count = Counter() for t in yeses: lower_count.update(t) residual_count = {e: max(self._min_freq - lower_count[e], 0) for e in self._elements} y = defaultdict(int) z = defaultdict(int) variable = None while True: slack = {t: 1 + z[t] - sum(y[e] for e in t) for t in maybes} assert all(s >= 0 for s in slack.values()) inactive_maybes = {t for t, s in slack.items() if s > 0} if not inactive_maybes: break active_maybes = {t for t, s in slack.items() if s == 0} active_count = Counter() for t in active_maybes: active_count.update(t) dy = {e: 1 for e, k in residual_count.items() if active_count[e] < k} if not dy: break delta_inverse, variable = max(((Fraction(sum(dy.get(e, 0) for e in t), slack[t]), t) for t in inactive_maybes), key=itemgetter(0)) delta = Fraction(1, delta_inverse) for e, dy_e in dy.items(): y[e] += delta * dy_e for t in active_maybes: z[t] += delta * sum(dy.get(e, 0) for e in t) return (sum(residual_count[e] * y_e for e, y_e in y.items()) - sum(z.values()), variable) def _make_node(self, maybes, yeses): maybes, yeses = self._propagate(maybes, yeses) heuristic, variable = self._compute_heuristic(maybes, yeses) node = Node(len(yeses) + heuristic, self._index, maybes, yeses, variable) self._index += 1 return node def _make_root_node(self): return self._make_node(self._tuples, set()) def _make_child_nodes(self, node): if node.variable is None: return variable = {node.variable} maybes = node.maybes - variable yield self._make_node(maybes, node.yeses) yield self._make_node(maybes, node.yeses | variable) def optimize(tuples, min_freq): optimizer = _Optimizer(tuples, min_freq) node = optimizer.optimize()[1] print('Nodes examined:', optimizer._index) return node.yeses print(optimize({(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)}, 2)) print(optimize({(1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14), (1, 2, 3, 4, 8, 9, 10, 11), (5, 6, 12, 13), (7, 14)}, 1))