代码之家  ›  专栏  ›  技术社区  ›  prakharjain

从HTML XML提取值的XSLT

  •  0
  • prakharjain  · 技术社区  · 6 年前

    我有一个XML

    而且,我需要提取生成的表的值。具体来说,每行第2列和第3列的行值。

    HTML看起来像

    table

    XML看起来像:

    <DIV><DIV><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><TR></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"><COL width="122px"><COL width="122px"><COL width="122px"><COL width="122px"></COLGROUP><TR></TR><TR><TD align="LEFT" colspan="5" style="border: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Nutrition</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Typical Values</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">Per 100g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">One tart (125g)</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">%RI*</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">RI*</SPAN></TD></TR></TABLE></TD></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"><COL width="122px"><COL width="122px"><COL width="122px"><COL width="122px"></COLGROUP><TR></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Energy</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1373kJ / 329kcal</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1717kJ / 411kcal</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">20%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">8400kJ / 2000kcal</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid 
black;padding-left: 3px;"><SPAN style="font-size: inherit;">Fat</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">20.0g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">25.0g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">36%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">70g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Saturates</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">11.2g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">14.0g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">70%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">20g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Carbohydrate</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">32.9g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">41.1g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">16%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">260g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Sugars</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">16.2g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 
122px;"><SPAN style="font-size: inherit;">20.2g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">22%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">90g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Fibre</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1.3g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">1.6g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">&nbsp;</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">&nbsp;</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Protein</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">3.9g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">4.9g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;"><SPAN style="font-size: inherit;">10%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;"><SPAN style="font-size: inherit;">50g</SPAN></TD></TR><TR><TD colspan="1" style="width: 160px;border-left: 1px solid black;border-bottom: 1px solid black;padding-left: 3px;"><SPAN style="font-size: inherit;">Salt</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-bottom: 1px solid black;"><SPAN style="font-size: inherit;">0.1g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-bottom: 1px solid black;"><SPAN style="font-size: inherit;">0.1g</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-bottom: 1px solid 
black;"><SPAN style="font-size: inherit;">2%</SPAN></TD><TD align="CENTER" colspan="1" style="width: 122px;border-right: 1px solid black;border-bottom: 1px solid black;"><SPAN style="font-size: inherit;">6g</SPAN></TD></TR></TABLE></TD></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"><COL width="122px"><COL width="122px"><COL width="122px"><COL width="122px"></COLGROUP><TR></TR><TR><TD colspan="5" style="border-left: 1px solid black;border-right: 1px solid black;"><SPAN style="font-size: inherit;">Contains 2 servings</SPAN></TD></TR></TABLE></TD></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><COLGROUP><COL width="160px"></COLGROUP><TR></TR><TR><TD colspan="1"><TABLE border-collapse="collapse" cellspacing="0" style="width: 650px;"><TR></TR><TR><TD colspan="1" style="border-left: 1px solid black;border-right: 1px solid black;border-bottom: 1px solid black;padding-left: 3px;"><P><SPAN>* Reference intake of an average adult (8400 kJ / 2000 kcal)</SPAN></P></TD></TR></TABLE></TD></TR></TABLE></TD></TR></TABLE></DIV></DIV>
    

    我试过什么: 我需要将值存储在XSLT中的变量中。

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Outputs the text content of every SPAN element in the input document.
      XSLT instructions must be in the XSLT namespace (xsl: prefix) and must
      appear inside a template; unprefixed for-each/value-of placed directly
      under xsl:stylesheet are not executed as instructions.
    -->
    <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
        <!-- Start at the document root so the SPAN search covers the whole input. -->
        <xsl:template match="/">
            <xsl:for-each select="//SPAN">
                <xsl:value-of select="." />
            </xsl:for-each>
        </xsl:template>
    </xsl:stylesheet>
    

    我该如何获取这些值?具体来说,我想得到:

    能量col2

    能量col3

    这两个值,并希望把它们保存在变量中。另外,我如何知道某个特定值是哪一种类型(能量或脂肪等)的第2列(或第3列)的值?

    1 回复  |  直到 6 年前
        1
  •  -1
  •   prakharjain    6 年前

    虽然这并没有直接回答问题(我是用正则表达式来解析这段HTML/XML的),但它确实解决了我的问题。所以,我改为从XSLT中调用Java函数。

    Java代码:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Extracts the second and third column values of each row of a nutrition
     * table from a string of HTML-like markup using a single regular expression.
     *
     * <p>Usage: call {@link #process(String)} first; if it returns {@code true},
     * the {@code getXxxPer100()} / {@code getXxxPerServ()} accessors return the
     * text of the first and second SPAN that follow the corresponding row label.
     *
     * <p>The most recent match is kept in a static field, so the class holds
     * shared mutable state: each call to {@code process} replaces the result
     * that the accessors read.
     */
    public class NutrientValues {

        /**
         * Row labels in the order they are expected in the markup. Each label
         * contributes two capture groups, so label {@code i} (0-based) owns
         * groups {@code 2*i + 1} and {@code 2*i + 2}.
         */
        private static final String[] NUTRIENTS = {
            "Energy", "Fat", "Saturates", "Carbohydrate", "Sugars", "Fibre", "Protein", "Salt"
        };

        /** Lazily skips to the next two SPAN elements and captures the text of each. */
        private static final String TWO_SPANS =
                ".*?<SPAN.*?>(.*?)</SPAN>.*?<SPAN.*?>(.*?)</SPAN>";

        private static final String regex = buildRegex();

        // DOTALL (not MULTILINE): the pattern has no ^ or $ anchors, but '.' must be
        // able to cross line breaks because the markup may be wrapped over several lines.
        private static final Pattern pattern = Pattern.compile(regex, Pattern.DOTALL);

        /** Matcher of the last process() call; null until process() is called or after a null input. */
        private static Matcher matcher;

        /** Builds the full expression: one "label + two captured SPANs" segment per nutrient. */
        private static String buildRegex() {
            StringBuilder sb = new StringBuilder();
            for (String nutrient : NUTRIENTS) {
                if (sb.length() > 0) {
                    sb.append(".*?");
                }
                sb.append(nutrient).append(TWO_SPANS);
            }
            return sb.toString();
        }

        /**
         * Returns capture group {@code index} of the last successful match.
         *
         * @throws IllegalStateException if process() has not been called, or its
         *         last call did not find a match
         */
        private static String capture(int index) {
            if (matcher == null) {
                throw new IllegalStateException(
                        "process(String) must be called with a non-null document before reading values");
            }
            return matcher.group(index);
        }

        /**
         * Searches {@code htmldoc} for the nutrition rows and remembers the match
         * for the accessor methods.
         *
         * @param htmldoc the markup to search; {@code null} is treated as "no match"
         * @return {@code true} if all rows were found, {@code false} otherwise
         */
        public static boolean process(String htmldoc) {
            if (htmldoc == null) {
                matcher = null;
                return false;
            }
            matcher = pattern.matcher(htmldoc);
            return matcher.find();
        }

        /** @return the Energy value from the first SPAN after the label (capture group 1) */
        public static String getEnergyPer100() {
            return capture(1);
        }

        /** @return the Energy value from the second SPAN after the label (capture group 2) */
        public static String getEnergyPerServ() {
            return capture(2);
        }

        /** @return the Fat value from the first SPAN after the label (capture group 3) */
        public static String getFatPer100() {
            return capture(3);
        }

        /** @return the Fat value from the second SPAN after the label (capture group 4) */
        public static String getFatPerServ() {
            return capture(4);
        }

        /** @return the Saturates value from the first SPAN after the label (capture group 5) */
        public static String getSaturatesPer100() {
            return capture(5);
        }

        /** @return the Saturates value from the second SPAN after the label (capture group 6) */
        public static String getSaturatesPerServ() {
            return capture(6);
        }

        /** @return the Carbohydrate value from the first SPAN after the label (capture group 7) */
        public static String getCarbohydratePer100() {
            return capture(7);
        }

        /** @return the Carbohydrate value from the second SPAN after the label (capture group 8) */
        public static String getCarbohydratePerServ() {
            return capture(8);
        }

        /** @return the Sugars value from the first SPAN after the label (capture group 9) */
        public static String getSugarsPer100() {
            return capture(9);
        }

        /** @return the Sugars value from the second SPAN after the label (capture group 10) */
        public static String getSugarsPerServ() {
            return capture(10);
        }

        /** @return the Fibre value from the first SPAN after the label (capture group 11) */
        public static String getFibrePer100() {
            return capture(11);
        }

        /** @return the Fibre value from the second SPAN after the label (capture group 12) */
        public static String getFibrePerServ() {
            return capture(12);
        }

        /** @return the Protein value from the first SPAN after the label (capture group 13) */
        public static String getProteinPer100() {
            return capture(13);
        }

        /** @return the Protein value from the second SPAN after the label (capture group 14) */
        public static String getProteinPerServ() {
            return capture(14);
        }

        /** @return the Salt value from the first SPAN after the label (capture group 15) */
        public static String getSaltPer100() {
            return capture(15);
        }

        /** @return the Salt value from the second SPAN after the label (capture group 16) */
        public static String getSaltPerServ() {
            return capture(16);
        }
    }
    

    结果:

    Group 1: 1373kJ / 329kcal
    Group 2: 1717kJ / 411kcal
    Group 3: 20.0g
    Group 4: 25.0g
    Group 5: 11.2g
    Group 6: 14.0g
    Group 7: 32.9g
    Group 8: 41.1g
    Group 9: 16.2g
    Group 10: 20.2g
    Group 11: 1.3g
    Group 12: 1.6g
    Group 13: 3.9g
    Group 14: 4.9g
    Group 15: 0.1g
    Group 16: 0.1g