The solution is to test a series of token / non-token units, instead of trying to check the correct number of split units or the correct number of tokens. If the token is a separator and you want the minimum number of divided units, the number of token / non-token units equal to one is required, less than the required number of units. As we will see, this solution has amazing performance.
Minimum row count
This function checks the minimum number of lines, where \n restricts lines, not limits them, which allows you to use the empty last line:
function hasMinLineCount(text, minLineCount) { if (minLineCount <= 1) return true; // always 1+ lines, though perhaps empty var r = new RegExp('([^\n]*\n){' + (minLineCount-1) + '}'); return r.test(text); }
Alternatively, \n can be considered the end lines, rather than just restricting them, making an exception for a non-empty last line. For example, "apple\npear\n" will have two lines, and "apple\npear\ngrape" will have three. The following function counts the lines as follows:
function hasMinLineCount(text, minLineCount) { var r = new RegExp('([^\n]*\n|[^\n]+$){' + minLineCount + '}'); return r.test(text); }
String delimiters and tokens
More generally, for any unit limited to a line separator:
var _ = require('lodash'); function hasMinUnitCount(text, minUnitCount, unitDelim) { if (minUnitCount <= 1) return true; // always 1+ units, though perhaps empty var escDelim = _.escapeRegExp(unitDelim); var r = new RegExp('(.*?'+ escDelim +'){' + (minUnitCount-1) + '}'); return r.test(text); }
We can also check for a minimum number of string tokens:
var _ = require('lodash'); function hasMinTokenCount(text, minTokenCount, token) { var escToken = _.escapeRegExp(token); var r = new RegExp('(.*?'+ escToken +'){' + minTokenCount + '}'); return r.test(text); }
Regular Expression Limiters and Tokens
We can further generalize by allowing delimiters and unit tokens to include regex characters. Just make sure that the dividers or tokens can uniquely meet each other. Examples of regular expression delimiters include "<br */>" and "[|,]" . These are strings, not RegExp objects.
function hasMinUnitCount(text, minUnitCount, unitDelimRegexStr) { if (minUnitCount <= 1) return true; // always 1+ units, though perhaps empty var r = new RegExp( '(.*?'+ unitDelimRegexStr +'){' + (minUnitCount-1) + '}'); return r.test(text); } function hasMinTokenCount(text, minTokenCount, tokenRegexStr) { var r = new RegExp('(.*?'+ tokenRegexStr +'){' + minTokenCount + '}'); return r.test(text); }
Estimated costs
Common functions work because their regular expressions do not have greedy character matching (note .*? ) Until the next delimiter or token. This is a computationally expensive forward and backward search process, so they require improved performance compared to more rigidly expressed expressions, such as those found in hasMinLineCount() above.
Let's get back to the original question of whether we can beat line breaks by testing regular expressions. Recall that our only goal is to check the minimum number of rows. I used benchmark.js for verification, and I assumed that we know that many lines are required. Here is the code:
var Benchmark = require('benchmark'); var suite = new Benchmark.Suite; var line = "Go faster faster faster!\n"; var text = line.repeat(100); var MIN_LINE_COUNT = 50; var preBuiltBackingRegex = new RegExp('(.*?\n){'+ MIN_LINE_COUNT +'}'); var preBuiltNoBackRegex = new RegExp('([^\n]*\n){'+ MIN_LINE_COUNT +'}'); suite.add('split string', function() { if (text.split("\n").length >= MIN_LINE_COUNT) 'has minimum lines'; }) .add('backtracking on-the-fly regex', function() { if (new RegExp('(.*?\n){'+ MIN_LINE_COUNT +'}').test(text)) 'has minimum lines'; }) .add('backtracking pre-built regex', function() { if (preBuiltBackingRegex.test(text)) 'has minimum lines'; }) .add('no-backtrack on-the-fly regex', function() { if (new RegExp('([^\n]*\n){'+ MIN_LINE_COUNT +'}').test(text)) 'has minimum lines'; }) .add('no-backtrack pre-built regex', function() { if (preBuiltNoBackRegex.test(text)) 'has minimum lines'; }) .on('cycle', function(event) { console.log(String(event.target)); }) .on('complete', function() { console.log('Fastest is ' + this.filter('fastest').map('name')); }) .run({ 'async': true });
The following are the results of three runs:
split string x 263,260 ops/sec ±0.68% (85 runs sampled) backtracking on-the-fly regex x 492,671 ops/sec ±1.01% (82 runs sampled) backtracking pre-built regex x 607,033 ops/sec ±0.72% (87 runs sampled) no-backtrack on-the-fly regex x 581,681 ops/sec ±0.77% (84 runs sampled) no-backtrack pre-built regex x 723,075 ops/sec ±0.72% (89 runs sampled) Fastest is no-backtrack pre-built regex split string x 260,962 ops/sec ±0.82% (85 runs sampled) backtracking on-the-fly regex x 502,410 ops/sec ±0.79% (84 runs sampled) backtracking pre-built regex x 606,220 ops/sec ±0.67% (88 runs sampled) no-backtrack on-the-fly regex x 578,193 ops/sec ±0.83% (86 runs sampled) no-backtrack pre-built regex x 741,864 ops/sec ±0.68% (84 runs sampled) Fastest is no-backtrack pre-built regex split string x 262,266 ops/sec ±0.76% (87 runs sampled) backtracking on-the-fly regex x 495,697 ops/sec ±0.82% (87 runs sampled) backtracking pre-built regex x 608,178 ops/sec ±0.72% (88 runs sampled) no-backtrack on-the-fly regex x 574,640 ops/sec ±0.92% (87 runs sampled) no-backtrack pre-built regex x 739,629 ops/sec ±0.72% (86 runs sampled) Fastest is no-backtrack pre-built regex
All regex tests are clearly faster than line splitting to check the number of lines, even reverse ones. I think I will do regular testing.