How can I track character positions after removing items from a string?

Let's say I have the following line:

 "my ., .,dog. .jumps. , .and..he. .,is., .a. very .,good, .dog"  
  1234567890123456789012345678901234567890123456789012345678901 <-- char pos

Now I wrote a regular expression to remove certain elements from the line above, in this example, all spaces, all periods and all commas.

I stay with the following converted string:

 "mydogjumpsandheisaverygooddog"

Now I want to build k-grams of this line. Let's say I had to take 5 grams of the specified string, it would look like this:

  mydog ydogj dogju ogjum gjump jumps umpsa ...

The problem is that for each k-graph I want to track its initial character position in the first source text I listed.

, "mydog", "0" "11". . , , k- , . , .

k-, :

public class Kgram
{
    public int start;  
    public int end;  
    public int text;  
}

start end - (), - k- .

- , ?

+3
4

"replace" API . , , , . , , , "n chars deleted" , .

. . , , , .

+5

Haskell:

kgramify k string =
  let charsWithPos = zip string [1..]  -- attach original position to each char
      goodCWP      = filter (not o isWhitePeriodOrComma o fst) charsWithPos -- drop nasty chars
      groups       = takeEveryK k goodCWP -- clump remaining chars in groups of size k
      posnOfGroup g = (snd (head g), map fst g) -- position of first char with group
  in  map posnOfGroup groups

:

  • (, )
  • k
  • ( ).

, Clean, Haskell, ML Scheme, . (new) , , malloc free, .

+5

C, , , , . , , 0 filterdata not_wspc :

int not_wspc(void *, char c) {
    if isspace((unsigned char)c) return 0;
    if ((c == '.') || (c == ',')) return 0;
    return 1;
}

typedef struct {
    char c;
    int pos;
} charwithpos;

KGram *foo(const char *input, int (*filter)(void *,char), void *filterdata) {
    size_t len = strlen(input);
    charwithpos *filtered = malloc(len * sizeof(*filtered));
    assert(filtered);

    // combine Norman zip and filter steps
    charwithpos *current = filtered
    for (size_t i = 0; i < len; ++i) {
        if (filter(filterdata, input[i])) {
            current->c = input[i];
            current->pos = i;
            ++current;
        }
    }
    size_t shortlen = (current - filtered);

    // wouldn't normally recommend returning malloced data, but
    // illustrates the point.
    KGram *result = malloc((shortlen / 5 + 1) * sizeof(*result));
    assert(result);

    // take each 5 step
    KGram *currentgram = result;
    current = filtered;
    for (size_t i = 0; i < shortlen; ++i) {
        currentgram->text[i%5] = current->c;
        if ((i % 5) == 0) {
            currentgram->start = current->pos;
        } else if ((i % 5) == 4) {
            currentgram->end = current->pos;
            ++currentgram;
        }
        ++current;
    }
    if (shortlen % 5) != 0 {
        currentgram->end = filtered[shortlen-1].pos;
        currentgram->text[shortlen%5] = 0;
    }

    free(filtered);
    return(result);
}

- , . , , filtered , , . , , . , C, , .

++-, <functional>. , new: , , , ; -)

template <typename OutputIterator>
struct KGramOutput {
    OutputIterator dest;
    KGram kgram;
    KGramOutput(OutputIterator dest) : dest(dest) {}
    void add(char, size_t);
    void flush(void);
};

template <typename InputIterator, typename OutputIterator, typename Filter>
void foo(InputIterator first, InputIterator last, OutputIterator dest, Filter filter) {
    size_t i = 0;
    KGramOutput<OutputIterator> kgram(dest);
    while (first != last) {
        if (filter(*first)) kgram.add(*first, i);
        ++first;
        ++i;
    }
    kgram.flush();
}

Functions addand a flushlittle tedious, they have to group 5 pairs into a KGram structure, and then do it *dest++ = kgram. The user could pass, for example, pushback_iteratorover vector<KGram>as an output iterator. Btw '5' and 'char' can also be template parameters.

+2
source

This can be done in one pass without the need to create intermediate pairs of characters:

(defclass k-gram ()
  ((start :reader start :initarg :start)
   (end :accessor end)
   (text :accessor text)))

(defmethod initialize-instance :after ((k-gram k-gram) &rest initargs &key k)
  (declare (ignorable initargs))
  (setf (slot-value k-gram 'text) (make-array k :element-type 'character)))

(defun k-gramify (string k ignore-string)
  "Builds the list of complete k-grams with positions from the original
   text, but with all characters in ignore-string ignored."
  (loop
     for character across string
     for position upfrom 0
     with k-grams = ()
     do (unless (find character ignore-string)
          (push (make-instance 'k-gram :k k :start position) k-grams)
          (loop
             for k-gram in k-grams
             for i upfrom 0 below k
             do (setf (aref (text k-gram) i) character
                      (end k-gram) (1+ position))))
     finally (return (nreverse (nthcdr (- k 1) k-grams)))))
+1
source

Source: https://habr.com/ru/post/1733704/


All Articles