I have done this project many times.
What you need to do:
1.) Check out the project "Extract text from PDF in C#". The project uses iTextSharp.
- It would be better if you download the sample project and see how it works. The project shows how to extract data from a PDF. Look at the PDFParser class: it has a function called ExtractTextFromPDFBytes(byte[] input), which shows how the text is extracted from an uncompressed PDF stream. Remember to include the iTextSharp DLL.
PDFParser Class
1 using System;
2 using System.IO;
3 using iTextSharp.text.pdf;
4
5 namespace PdfToText
6 {
7 ///
8 /// Parses a PDF file and extracts the text from it.
9 ///
10 public class PDFParser
11 {
12 /// BT = Beginning of a text object operator
13 /// ET = End of a text object operator
14 /// Td move to the start of next line
15 /// 5 Ts = superscript
16 /// -5 Ts = subscript
17
18 #region fields
19
20 #region _numberOfCharsToKeep
21 ///
22 /// The number of characters to keep, when extracting text.
23 ///
24 private static int _numberOfCharsToKeep = 15;
25 #endregion
26
27 #endregion
28
29 #region ExtractText
30 ///
31 /// Extracts a text from a PDF file.
32 ///
33 /// the full path to the pdf file.
34 /// the output file name.
35 /// the extracted text
36 public bool ExtractText (string inFileName, string outFileName)
37 {
38 StreamWriter outFile = null;
39 try
40 {
41 // Create a reader for the given PDF file
42 PdfReader reader = new PdfReader (inFileName);
43 // outFile = File.CreateText (outFileName);
44 outFile = new StreamWriter (outFileName, false, System.Text.Encoding.UTF8);
45
46 Console.Write ("Processing:");
47
48 int totalLen = 68;
49 float charUnit = ((float) totalLen) / (float) reader.NumberOfPages;
50 int totalWritten = 0;
51 float curUnit = 0;
52
53 for (int page = 1; page = 1.0f)
59 {
60 for (int i = 0; i = 1.0f)
70 {
71 for (int i = 0; i
104 /// This method processes an uncompressed Adobe (text) object
105 /// and extracts text.
106 ///
107 /// uncompressed
108 ///
109 private string ExtractTextFromPDFBytes (byte [] input)
110 {
111 if (input == null || input.Length == 0) return "";
112
113 try
114 {
115 string resultString = "";
116
117 // Flag showing if we are we currently inside inside a text object
118 bool inTextObject = false;
119
120 // Flag showing if the next character is literal
121 // eg '\\' to get a '\' character or '\ (' to get '('
122 bool nextLiteral = false;
123
124 // () Bracket nesting level. Text appears inside ()
125 int bracketDepth = 0;
126
127 // Keep previous chars to get extract numbers etc .:
128 char [] previousCharacters = new char [_numberOfCharsToKeep];
129 for (int j = 0; j = '') && (c = 128) && (c
/// <summary>
/// Checks whether one of the given two-character tokens (e.g. "BT") just came
/// along: the token must sit directly before the last kept character and be
/// delimited on both sides by a space, CR or LF.
/// </summary>
/// <param name="tokens">the searched tokens; only the first two characters of each are compared</param>
/// <param name="recent">the recent character array; must hold at least _numberOfCharsToKeep characters</param>
/// <returns>true if any token matches, otherwise false</returns>
private bool CheckToken(string[] tokens, char[] recent)
{
    // Positions are counted back from the end of the kept-characters window.
    int trailingDelimiter = _numberOfCharsToKeep - 1;
    int secondChar = _numberOfCharsToKeep - 2;
    int firstChar = _numberOfCharsToKeep - 3;
    int leadingDelimiter = _numberOfCharsToKeep - 4;

    foreach (string token in tokens)
    {
        // A token shorter than two characters can never match; skipping it
        // avoids an IndexOutOfRangeException on token[1].
        if (token == null || token.Length < 2)
        {
            continue;
        }

        if (recent[firstChar] == token[0] &&
            recent[secondChar] == token[1] &&
            IsTokenDelimiter(recent[trailingDelimiter]) &&
            IsTokenDelimiter(recent[leadingDelimiter]))
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Returns true for the characters accepted around a token:
/// space, carriage return (0x0d) or line feed (0x0a).
/// </summary>
private static bool IsTokenDelimiter(char c)
{
    return c == ' ' || c == 0x0d || c == 0x0a;
}
259 #endregion
260}
261}
2.) Parse the extracted text and create an xml file.
One problem I have mainly run into is PDF files that contain broken links or URLs inside the pages. If you are also concerned about this, a regex can easily solve it, but I suggest you deal with that later.
Here is some sample code showing how to create an XML file. Make sure you understand how it works, so that you can adapt it to your own code later.
try
{
    // Resolve the output path relative to the web application.
    string xmlFile = Server.MapPath("DVDlist.xml");

    // Create the XML file if it does not exist. The using block closes the
    // writer (and the underlying file) even when one of the writes throws.
    using (System.Xml.XmlTextWriter writer = new System.Xml.XmlTextWriter(xmlFile, null))
    {
        // Start a new document
        writer.WriteStartDocument();

        // Write a comment
        writer.WriteComment("Commentss: XmlWriter Test Program");
        writer.Formatting = System.Xml.Formatting.Indented;

        // <DVDlist><DVD ID="1">
        writer.WriteStartElement("DVDlist");
        writer.WriteStartElement("DVD");
        writer.WriteAttributeString("ID", "1");

        // Write some simple elements
        writer.WriteElementString("Title", "Tere Naam");
        writer.WriteStartElement("Starring");
        writer.WriteElementString("Actor", "Salman Khan");

        // Close Starring, DVD and DVDlist, in that order
        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.WriteEndElement();
    }
}
catch (Exception e1)
{
    // Report the failure in the page output
    Page.Response.Write(e1);
}
Hope this helps :)
source share