F#: How to tokenize user input (separating numbers, units, and words)?

I am new to F# but have spent the past few weeks reading reference materials. I want to process a line of user input by identifying and separating its constituent elements. For example, for this input:

XYZ Hotel: 6 nights at 220EUR / night plus 17.5% tax

the output should resemble something like a list of tuples:

[("XYZ", Word); ("Hotel:", Word);
 ("6", Number); ("nights", Word);
 ("at", Operator); ("220", Number);
 ("EUR", CurrencyCode); ("/", Operator); ("night", Word);
 ("plus", Operator); ("17.5", Number); ("%", PerCent); ("tax", Word)]

Since I'm dealing with user input, it could be anything, so I cannot expect users to follow any grammar. I want to identify numbers (integers, floats, negatives, ...), units of measurement (optional, but these may include SI or imperial physical units, currency codes such as "EUR", or counts such as "night(s)" in my example), mathematical operators (either mathematical symbols or words such as "by", "for", "from", "discount", etc.), and all other words.

I got the impression that I should use active pattern matching - is that right? - but I'm not quite sure how to start. Any pointers to relevant reference material or similar examples would be great.

+3
2

Here is an example that uses the FParsec library. It is only a sketch, but it shows how FParsec can be used for this.

/// A classified token produced by the parsers in this file.
/// Every case carries the matched text as a string.
type Element =
/// One or more letters and/or ':' characters (built by parseWord).
| Word of string
/// Digits with an optional '.' and fractional digits, kept as text (built by parseNumber).
| Number of string
/// Text matched by one of the parsers in `operators` (built by parseOperator).
| Operator of string
/// Text matched by one of the parsers in `currencyCodes` (built by parseCurrencyCode).
| CurrencyCode of string
/// The literal "%" (built by parsePerCent).
| PerCent  of string    

/// Parses the literal "%" and wraps the matched text in `PerCent`.
/// The parser is applied to `state` explicitly, like the other parsers here.
let parsePerCent state =
    (pstring "%" |>> PerCent) state

/// Parsers for the recognised currency codes; `parseCurrencyCode` passes
/// this array to `choice`.
///
/// A code is only accepted when it is not directly followed by another
/// letter, so a longer word that merely starts with a code (e.g. "EUROPE")
/// is left for `parseWord` instead of being split into
/// CurrencyCode "EUR" + Word "OPE".
/// `attempt` restores the input position when the look-ahead fails, so
/// `choice` can still try the remaining alternatives.
let currencyCodes = [|
    attempt (pstring "EUR" .>> notFollowedBy letter)
|]

/// Tries each parser in `currencyCodes` and wraps the matched text in
/// `CurrencyCode`.
let parseCurrencyCode state =
    (choice currencyCodes |>> CurrencyCode) state

/// Parsers for the recognised operators; `parseOperator` passes this array
/// to `choice`.
///
/// The word operator "at" is only accepted when it is not directly followed
/// by another letter. Without that check a word such as "attic" would be
/// split into Operator "at" + Word "tic", because `parseOperator` is tried
/// before `parseWord`. `attempt` restores the input position when the
/// look-ahead fails, so `choice` can still try the remaining alternatives.
/// The symbol "/" needs no such guard.
let operators = [|
    attempt (pstring "at" .>> notFollowedBy letter)
    pstring "/"
|]

/// Tries each parser in `operators` and wraps the matched text in
/// `Operator`.
let parseOperator state =
    (choice operators |>> Operator) state

/// Parses an unsigned number: one or more digits, an optional '.', then
/// zero or more digits. The pieces are glued back together and returned
/// as `Number` text (no numeric conversion is performed).
let parseNumber state =
    // Rebuild the textual form from the three parsed pieces.
    let assemble whole dot fraction =
        let separator =
            match dot with
            | Some _ -> "."
            | None -> ""
        Number (whole + separator + fraction)
    (pipe3 (many1Chars digit) (opt (pchar '.')) (manyChars digit) assemble) state

/// Parses a run of one or more letters and/or ':' characters and wraps it
/// in `Word`.
let parseWord state =
    let wordChar = letter <|> pchar ':'
    (many1Chars wordChar |>> Word) state

/// All element parsers, in the order `parseElement` hands them to `choice`.
/// Operators and currency codes are listed ahead of the general word parser.
let elements =
    [| parseOperator; parseCurrencyCode; parseWord; parseNumber; parsePerCent |]

/// Parses one element: skips whitespace, runs the first parser in
/// `elements` that matches, skips whitespace again, and returns the
/// element.
let parseElement state =
    (spaces >>. choice elements .>> spaces) state

/// Parses elements until end of input and returns them as a list.
///
/// Leading whitespace is skipped before the loop so that whitespace-only
/// input yields an empty list, just like empty input. Without this, `eof`
/// fails on the spaces, `parseElement` consumes them and then finds no
/// element, and the whole parse fails. `parseElement` already skips the
/// whitespace after each element, so `eof` is reached directly after the
/// last one.
let parseElements state =
    (spaces >>. manyTill parseElement eof) state

/// Runs `parseElements` over `input` and returns the parsed elements.
/// Raises (via `failwith`) with the parser's error message when parsing
/// fails.
/// Note: from this definition onward the name `parse` refers to this
/// function rather than the `parse { ... }` builder used above.
let parse (input:string) =
    match run parseElements input with
    | Success (tokens, _, _) -> tokens
    | Failure (message, _, _) -> failwith message
+5

An alternative to FParsec is FsLex. With FsLex, you could take your input:

XYZ Hotel: 6 nights at 220EUR / night plus 17.5% tax

and turn it into a token list like this:

 [ Word("XYZ"); Hotel; Int(6); Word("nights"); Word("at"); Int(220); EUR; ... ]

From there, you can pattern-match over the token list to pick out the combinations you care about, for example:

// Sketch of walking a lexer's token list and recognising multi-token
// patterns. The constructors used here (Float, Keyword, Word) belong to the
// lexer's token type, which is not shown; the bodies of the first two arms
// are intentionally left out.
// NOTE(review): there is no arm for the empty list, so the recursion has no
// visible stopping case - add one when filling in the arm bodies.
let rec processTokenList tokens = 
    match tokens with
    | Float(x) :: Keyword("EUR") :: rest  -> // amount x in EUR (body omitted)
    | Word(x) :: Keyword("Hotel") :: rest -> // hotel named x (body omitted)
    | hd :: rest -> // nothing interesting starts at hd: skip it and
                    // continue with the remaining tokens
                    processTokenList rest

. , , "", . ( , !)

+1

Source: https://habr.com/ru/post/1784646/


All Articles