代码之家 › 专栏 › 技术社区 › emsimpson92

用regex标识标题

pdf parsing regex c#

-1

emsimpson92 · 技术社区 · 6 年前

我想知道如何用一个或多个正则表达式来识别具有不同数字标记样式的标题,假设有时文档之间的样式重叠。目标是提取每个文件中特定标题的所有子标题和数据,但这些文件没有标准化。正则表达式在这里是否是正确的方法?

我正在开发一个程序,它解析 .pdf 文件并查找特定的章节。一旦找到该章节,它就会找到该章节的所有子章节及其内容,并将它们存储在 Dictionary<string, string> 中。我首先将整个 PDF 读取到一个字符串中,然后使用下面的函数定位“marking”章节。

private string getmarkingselection(字符串文本)
{
int startindex=0;
int endindex=0;
bool startindexfound=false;
regex rx=新regex(headingreex);
foreach(在rx.matches(文本)中匹配匹配)
{
if(找到startindex)
{
endindex=match.index;














dictionary<string, string>

/// <summary>
/// Extracts the "marking" section from the full document text.
/// </summary>
/// <param name="text">The whole document as a single string.</param>
/// <returns>
/// The text from the first heading (a match of HEADINGREGEX) that contains
/// "marking" (case-insensitive) up to, but not including, the next heading.
/// If the marking heading is the last heading, the section runs to the end
/// of <paramref name="text"/>. Returns an empty string when no heading
/// contains "marking".
/// </returns>
private string GetMarkingSection(string text)
    {
      // -1 means "marking heading not seen yet".
      int startIndex = -1;
      int endIndex = 0;
      Regex rx = new Regex(HEADINGREGEX);
      foreach (Match match in rx.Matches(text))
      {
        if (startIndex >= 0)
        {
          // First heading after the marking heading closes the section.
          endIndex = match.Index;
          break;
        }
        // Ordinal, case-insensitive search avoids culture-specific casing
        // surprises (e.g. the Turkish dotless i) that ToLower() can cause.
        if (match.Value.IndexOf("marking", StringComparison.OrdinalIgnoreCase) >= 0)
        {
          startIndex = match.Index;
          // Default the end to the end of the document so that a marking
          // section with no following heading does not produce a negative
          // Substring length.
          endIndex = text.Length;
        }
      }
      if (startIndex < 0)
      {
        return string.Empty;
      }
      return text.Substring(startIndex, endIndex - startIndex);
    }




/// <summary>
/// Splits a section into its subsections, keyed by subsection title.
/// </summary>
/// <param name="text">The text of one section.</param>
/// <returns>
/// A dictionary mapping each subsection title to the content that follows it.
/// Content that appears before any title, and whitespace-only content, is
/// dropped. If the same title occurs more than once, the contents are joined
/// with a newline instead of throwing on the duplicate key.
/// </returns>
private Dictionary<string, string> GetSubsections(string text)
    {
      Dictionary<string, string> subsections = new Dictionary<string, string>();
      // Regex.Split also returns the text captured by groups in the pattern,
      // so the pieces contain both titles and the content between them.
      // NOTE(review): this relies on SUBSECTIONREGEX having a capturing group
      // around the title - confirm against its definition.
      string title = "";
      foreach (string piece in Regex.Split(text, SUBSECTIONREGEX))
      {
        if (piece.Length == 0) //sometimes it pulls in empty strings
        {
          continue;
        }
        if (Regex.IsMatch(piece, SUBSECTIONREGEX))
        {
          // A piece that itself matches the pattern is a title; remember it
          // for the content piece that follows.
          title = piece;
          continue;
        }
        if (String.IsNullOrWhiteSpace(piece) || String.IsNullOrWhiteSpace(title))
        {
          continue;
        }
        string existing;
        if (subsections.TryGetValue(title, out existing))
        {
          // Repeated title: keep both contents rather than crashing in Add.
          subsections[title] = existing + Environment.NewLine + piece;
        }
        else
        {
          subsections.Add(title, piece);
        }
      }
      return subsections;
    }


Here is a link to some sample headings and subheadings from various documents.


(?m)^(\d+\.\d+\s[ \w,\-]+)\r?$
(?m)^(\d\.[\d.]+ ?[ \w]+) ?\r?$
(?m)^(\d\.?[\d.]*? ?[ \-,:\w]+) ?\r?$










GetMarkingSection

/// <summary>
/// Finds the "marking" heading and returns its subheadings with their content.
/// </summary>
/// <param name="text">The whole document as a single string.</param>
/// <returns>
/// A dictionary built from the subheading matches inside the first heading
/// that matches HEADINGMASTERKEY and whose group 2 contains "marking"
/// (case-insensitive): key is group 1 + " " + group 2, value is group 3.
/// Returns <c>null</c> when no such heading exists.
/// </returns>
/// <remarks>
/// Headings are looked up with HEADING1REGEX (subheadings: HEADING2REGEX).
/// Only when the document has no HEADING1REGEX matches at all does it fall
/// back to HEADING2REGEX (subheadings: HEADING3REGEX).
/// NOTE(review): assumes group 2 of the heading patterns is the title and
/// groups 1-3 of the subheading patterns are number, title and content -
/// confirm against the HEADING*REGEX definitions.
/// </remarks>
private Dictionary<string, string> GetMarkingSection(string text)
    {
      const RegexOptions options = RegexOptions.Multiline | RegexOptions.Singleline;

      // Run the level-1 pattern once and reuse the result, instead of
      // matching the whole document twice.
      MatchCollection headings = Regex.Matches(text, HEADING1REGEX, options);
      var subheadingRegex = HEADING2REGEX;
      if (headings.Count == 0)
      {
        // No level-1 headings in this document: treat level 2 as the
        // headings and level 3 as the subheadings.
        headings = Regex.Matches(text, HEADING2REGEX, options);
        subheadingRegex = HEADING3REGEX;
      }

      foreach (Match m in headings)
      {
        if (!Regex.IsMatch(m.Value, HEADINGMASTERKEY))
        {
          continue;
        }
        // Ordinal, case-insensitive search instead of culture-sensitive ToLower().
        if (m.Groups[2].Value.IndexOf("marking", StringComparison.OrdinalIgnoreCase) < 0)
        {
          continue;
        }

        Dictionary<string, string> markingSection = new Dictionary<string, string>();
        foreach (Match s in Regex.Matches(m.Value, subheadingRegex, options))
        {
          markingSection.Add(s.Groups[1].Value + " " + s.Groups[2].Value, s.Groups[3].Value);
        }
        // Only the first marking heading is processed.
        return markingSection;
      }
      return null;
    }

1 回复 | 直到 6 年前

Matt.G 6 年前

// Top-level section ("1 Title"): a line starting with digits and one
// whitespace character, the rest of that line as `title`, then the following
// text as `content`, ending before the next line that starts with
// digits + whitespace or at the end of the input.
// Needs RegexOptions.Multiline (so ^ and $ anchor at line boundaries) and
// RegexOptions.Singleline (so . can span newlines inside `content`).
var heading1Regex = @"^(\d+)\s(?<title>.*?)$\n(?<content>.*?)$\n*(?=^\d+\s|\Z)";

Demo

// Second-level section ("1.2 Title"): a line starting with digits '.' digits
// and one whitespace character, the rest of that line as `title`, then the
// following text as `content`, ending before the next line that starts with
// digits '.' digits + whitespace or at the end of the input.
// Needs RegexOptions.Multiline (so ^ and $ anchor at line boundaries) and
// RegexOptions.Singleline (so . can span newlines inside `content`).
var heading2Regex = @"^(\d+)\.(\d+)\s(?<title>.*?)$\n(?<content>.*?)$\n*(?=^\d+\.\d+\s|\Z)";

Demo

// Third-level section ("1.2.3 Title"): a line starting with three
// dot-separated numbers and one whitespace character, the rest of that line
// as `title`, then the following text as `content`, ending before the next
// line that starts with three dot-separated numbers + whitespace or at the
// end of the input.
// Needs RegexOptions.Multiline (so ^ and $ anchor at line boundaries) and
// RegexOptions.Singleline (so . can span newlines inside `content`).
var heading3Regex = @"^(\d+)\.(\d+)\.(\d+)\s(?<title>.*?)$\n(?<content>.*?)$\n*(?=^\d+\.\d+\.\d+\s|\Z)";

Demo

var headingRegex = heading1Regex;
var subHeadingRegex = heading2Regex;

if there are any matches for headingRegex
{
    for each match, find matches for subHeadingRegex
}
else
{
    var headingRegex = heading2Regex;
    var subHeadingRegex = heading3Regex;
    //repeat same steps
}

here

int.TryParse(match.group1, out var headingIndex);

int.TryParse(match.group1, out var subHeadingIndex);