Matching the selected option to a specific html tag in java Regex

I need to parse some HTML to extract a set of values. The HTML is not always well formed and I have no control over it (so a scanner does not seem to be an option).

The HTML is a shopping basket, and the basket contains a number of rows, each of which has a quantity drop-down. I want to get the total quantity of products in the basket.

Given this HTML, I would like to match the values 2 and 5:

...
<select attr="other stuff" name="quantity">
    <option value="1" />
    <option value="2" selected="selected" />
</select>
....
<select name="quantity" attr="other stuff">
    <option selected="selected" value="5" />
    <option value="6" />
</select>

I have made several poor attempts, but given the number of variables (for example, the order of the "value" and "selected" attributes), most of my solutions either don't work or are very slow.

The last Java code I ended up with is the following

Pattern pattern = Pattern.compile("select(.*?)name=\"quantity\"([.|\\n|\\r]*?)option(.*?)value=\"(/d)\" selected=\"selected\"", Pattern.DOTALL);
Matcher matcher = pattern.matcher(html);
if (matcher.find()) {
   ....
}

, . Regex

+4
4

You can use XPath to query the HTML, for example:

//select[@name="quantity"]/option[@selected="selected"]/@value

:

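  • It finds every <select> element in the XML whose name attribute is quantity, then each of its <option> children whose selected attribute is selected,
  • and returns the value attribute of each such option.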

XQuery/XPath, . XML XPath Java , . XPath .


For example, if in addition to selected="selected" the option must also have status="accepted", the XPath becomes:

//select[@name="quantity"]/option[@selected="selected" and @status="accepted"]/@value

XPath , , , .

, RegEx ? , . - , (cf bobble bubble answer) ? , , ?

, , . .

: , .


, :

import java.io.StringReader;
import javax.xml.xpath.*;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * Demonstrates extracting attribute values from an XML-well-formed HTML
 * snippet using the XPath API that ships with the JDK.
 *
 * <p>Running {@link #main(String[])} prints the {@code value} of every
 * selected option inside each {@code <select name="quantity">}, one per line.
 */
public class ReadElementsFromHtmlUsingXPath {

    /** Sample markup containing two "quantity" drop-downs (2 and 5 are selected). */
    private static final String html =
            "<html>Read more about XPath <a href=\"www.w3schools.com/xsl/xpath_intro.asp\">here</a>..."
            + "<select attr=\"other stuff\" name=\"quantity\">"
            + "<option value=\"1\" />"
            + "<option value=\"2\" selected=\"selected\" />"
            + "</select>"
            + "<i><b>Oh and here the second element</b></i>"
            + "<select name=\"quantity\" attr=\"other stuff\">"
            + "<option selected=\"selected\" value=\"5\" />"
            + "<option value=\"6\" />"
            + "</select>"
            + "And that all folks</html>";

    /** Selects the {@code value} attribute of each selected option of a quantity select. */
    private static final String xpathExpr =
            "//select[@name=\"quantity\"]/option[@selected=\"selected\"]/@value";

    /**
     * Evaluates {@link #xpathExpr} against {@link #html} and prints each
     * matching attribute value on its own line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final InputSource document = new InputSource(new StringReader(html));
        final NodeList selectedValues;
        try {
            XPathExpression query = XPathFactory.newInstance().newXPath().compile(xpathExpr);
            // NODESET makes evaluate() return a NodeList of the matched attribute nodes.
            selectedValues = (NodeList) query.evaluate(document, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            e.printStackTrace();
            return;
        }

        int index = 0;
        while (index < selectedValues.getLength()) {
            System.out.println(selectedValues.item(index).getNodeValue());
            index++;
        }
    }
}

Output:

2
5
+4

, , HTML-. Parser solution, .

, , , .

(?xi) # i-flag for caseless, x-flag for comments (free spacing mode) 

# 1.) match <select with optional space at the end
<\s*select\s[^>]*?\bname\s*=\s*["']\s*quantity[^>]*>\s*

# 2.) match lazily any amount of options until the "selected"
(?:<\s*option[^>]*>\s*)*?

# 3.) match selected using a lookahead and capture number from value
<\s*option\s(?=[^>]*?\bselected)[^>]*?\bvalue\s*=\s*["']\s*(\d[.,\d]*)

Try it on regex101 or RegexPlanet (Java). As a Java string:

"(?i)<\\s*select\\s[^>]*?\\bname\\s*=\\s*[\"']\\s*quantity[^>]*>\\s*(?:<\\s*option[^>]*>\\s*)*?<\\s*option\\s(?=[^>]*?\\bselected)[^>]*?\\bvalue\\s*=\\s*[\"']\\s*(\\d[.,\\d]*)"

. - html.

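  • \s is shorthand for whitespace, roughly [ \t\r\n\f]
  • \d is shorthand for a digit, [0-9]
  • \b matches a word boundary
  • (?: starts a non-capturing group
  • [^>] is a negated character class: any character that is not > (so a match cannot run past the end of the tag)
  • (?=[^>]*?\bselected) is a lookahead that checks that selected appears somewhere inside the tag
  • (\d[.,\d]*) captures the number: a digit followed by any number of characters from [.,\d]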

The captured value is available in group(1) (the first capturing group).

+3

Divide and Conquer.

Option:

/**
 * Mutable holder for one HTML {@code <option>} element: the text of its
 * {@code value} attribute and whether the option is marked as selected.
 */
public class Option {

    private String value;
    private boolean selected;

    /** Creates an option with a {@code null} value that is not selected. */
    public Option() {
        this(null, false);
    }

    /**
     * Creates an option with the given state.
     *
     * @param value    the option's value text
     * @param selected whether the option is selected
     */
    public Option(String value, boolean selected) {
        this.value = value;
        this.selected = selected;
    }

    /** @return the option's value text, possibly {@code null} */
    public String getValue() {
        return this.value;
    }

    /** @param value the new value text */
    public void setValue(String value) {
        this.value = value;
    }

    /** @return {@code true} if the option is selected */
    public boolean isSelected() {
        return this.selected;
    }

    /** @param selected the new selected flag */
    public void setSelected(boolean selected) {
        this.selected = selected;
    }

    /** Renders the option as <code>{value='V', selected=S}</code>. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("{");
        text.append("value='").append(value).append('\'');
        text.append(", selected=").append(selected);
        text.append('}');
        return text.toString();
    }

}

Second, a regex that matches an option HTML tag:

// Matches one self-closing <option ... /> tag whose attributes are a
// value="..." (word characters only, \w+) and an optional
// selected="selected", accepted in either order.
static final Pattern OPTION_TAG_PATTERN = Pattern.compile("<option\\s*(value=\"\\w+\"\\s+(?:selected=\"selected\")?|(?:selected=\"selected\")?\\s+value=\"\\w+\")\\s*/>");

And a regex that extracts the value:

// Finds a value="..." attribute and captures its word-character contents in group 1.
static final Pattern VALUE_PATTERN = Pattern.compile("value=\"(\\w+)\"");

, :

/**
 * Demo driver: scans an HTML fragment for self-closing {@code <option>} tags
 * and prints each one as an {@link Option}.
 *
 * <p>Only the option tags themselves are inspected; the enclosing
 * {@code <select>} is not checked.
 */
public class Test {

    /**
     * Matches one self-closing option tag carrying a {@code value="..."}
     * attribute and an optional {@code selected="selected"}, in either order.
     */
    private static final Pattern OPTION_TAG_PATTERN = Pattern.compile("<option\\s*(value=\"\\w+\"\\s+(?:selected=\"selected\")?|(?:selected=\"selected\")?\\s+value=\"\\w+\")\\s*/>");

    /** Captures the contents of a {@code value="..."} attribute in group 1. */
    private static final Pattern VALUE_PATTERN = Pattern.compile("value=\"(\\w+)\"");

    /**
     * Parses the sample basket markup and prints every option found.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "...\n" +
                "<select attr=\"other stuff\" name=\"quantity\">\n" +
                "    <option value=\"1\" />\n" +
                "    <option value=\"2\" selected=\"selected\" />\n" +
                "</select>\n" +
                "....\n" +
                "<select name=\"quantity\" attr=\"other stuff\">\n" +
                "    <option selected=\"selected\" value=\"5\" />\n" +
                "    <option value=\"6\" />\n" +
                "</select>";
        for (Option option : findOptions(html)) {
            System.out.println(option);
        }
    }

    /**
     * Collects every option tag matched by {@link #OPTION_TAG_PATTERN}.
     *
     * @param htmlContent the markup to scan
     * @return the options in the order they appear in {@code htmlContent}
     */
    public static List<Option> findOptions(String htmlContent) {
        List<Option> found = new ArrayList<>();
        for (Matcher tags = OPTION_TAG_PATTERN.matcher(htmlContent); tags.find(); ) {
            // group() is the full text of the matched tag.
            found.add(toOption(tags.group()));
        }
        return found;
    }

    /**
     * Builds an {@link Option} from the text of a single option tag.
     *
     * @param htmlTag the matched tag text
     * @return an option whose value is the captured attribute text (or
     *         {@code null} if none is found) and whose selected flag reflects
     *         the presence of {@code selected="selected"}
     */
    private static Option toOption(String htmlTag) {
        Matcher attribute = VALUE_PATTERN.matcher(htmlTag);
        String value = attribute.find() ? attribute.group(1) : null;
        boolean selected = htmlTag.contains("selected=\"selected\"");
        return new Option(value, selected);
    }

}

Output:

{value='1', selected=false}
{value='2', selected=true}
{value='5', selected=true}
{value='6', selected=false}

, !

+2
source

I believe regex is not suitable for this, simply because complexity makes reading and diagnosing code difficult. We can still use the regex, but break the logic to make it easier to read and improve:

// Sample basket markup: two quantity drop-downs with 2 and 5 selected.
String html = "<select attr=\"other stuff\" name=\"quantity\">"
        + "<option value=\"1\" /> "
        + "<option value=\"2\" selected=\"selected\" /> "
        + "</select> "
        + "<select name=\"quantity\" attr=\"other stuff\"> "
        + "<option selected=\"selected\" value=\"5\" /> "
        + "<option value=\"6\" /> " + "</select>";

// Step 1: isolate the attribute text between "<option" and "/>" of each tag.
Pattern optionBody = Pattern.compile("(?<=<option).*?(?=/>)", Pattern.DOTALL);
// Step 2: within one tag, locate the value text and the selected marker.
Pattern valueText = Pattern.compile("(?<=value=\").*?(?=\")");
Pattern selectedMarker = Pattern.compile("selected=\"selected\"");

int total = 0;
Matcher tag = optionBody.matcher(html);
while (tag.find()) {
    String attributes = tag.group();
    if (!selectedMarker.matcher(attributes).find()) {
        continue; // only selected options contribute to the total
    }
    Matcher value = valueText.matcher(attributes);
    if (value.find()) {
        total += Integer.parseInt(value.group());
    }
}
System.out.println(Integer.toString(total));
}

which prints the required 7

0
source

Source: https://habr.com/ru/post/1618569/


All Articles