Is there any way to extract paragraph text from a PDF file using iTextSharp? I know that PDF only stores runs of text, which makes it difficult to determine which lines belong to which paragraph, and I also know that PDF has no <p> or similar tags to mark a paragraph. I tried to use the coordinates of the text runs to rebuild paragraphs from them, but with no luck. Here is my code:
// Maximum difference between two baseline Y coordinates for them to still be
// treated as the same line. Exact float equality is unreliable here because the
// coordinates may carry rounding noise.
// NOTE(review): 0.01 is a conservative guess — tune for your documents.
private const float BaselineTolerance = 0.01f;

// Accumulates the text of the line currently being read.
private StringBuilder result = new StringBuilder();

// Baseline start point of the most recently rendered text run (null before the first run).
private Vector lastBaseLine;

// Completed lines of text, in the order they were rendered.
public List<string> strings = new List<String>();

// Baseline Y coordinate of each completed line; index-aligned with `strings`.
public List<float> baselines = new List<float>();

/// <summary>
/// Receives one run of text and groups consecutive runs that share a baseline
/// into a single line.
/// </summary>
/// <remarks>
/// When the baseline Y coordinate changes, the accumulated line is moved into
/// <c>strings</c> and its Y coordinate into <c>baselines</c>. The final line stays
/// buffered in <c>result</c> until another baseline change occurs, so call
/// <c>FlushLine()</c> once extraction has finished to capture it.
/// </remarks>
/// <param name="renderInfo">The text run reported by the parser.</param>
public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
{
    Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();

    if (this.lastBaseLine != null)
    {
        float deltaY = Math.Abs(curBaseline[Vector.I2] - this.lastBaseLine[Vector.I2]);

        // A baseline shift larger than the tolerance means a new line has started.
        if (deltaY > BaselineTolerance)
        {
            FlushLine();
        }
    }

    this.result.Append(renderInfo.GetText());
    this.lastBaseLine = curBaseline;
}

// Stores the buffered line (if any) together with its baseline Y coordinate, then resets the buffer.
private void FlushLine()
{
    if (this.result.Length > 0 && this.lastBaseLine != null)
    {
        this.baselines.Add(this.lastBaseLine[Vector.I2]);
        this.strings.Add(this.result.ToString());
    }

    this.result.Length = 0;
}
Does anybody have any logic for solving this problem?
source share