C#解析PDF的方式有很多,比較好用的有iTextSharp和PdfBox。
PDF內容頁如果是圖片類型,例如掃描件,則需要進行OCR(光學字符識別)。
文本內容的PDF文檔,解析的過程中,我目前僅發現能以字符串的形式讀取的,不能夠讀取其中的表格。據說PDF文檔結構中是沒有表格概念的,因此這個自然是讀不到的,如果果真如此,則PDF中表格內容的解析,只能對獲取到的字符串按照一定的邏輯自行解析了。
iTextSharp是一個C#開源項目,PdfBox為Java開源項目,借助于IKVM在.NET平臺下有實現。
Pdf轉換Image,使用的是GhostScript,可以以API的方式調用,也可以以Windows命令行的方式調用。
OCR使用的是Asprise,識別效果較好(商業軟件),另外還可以使用MS Office的Document Imaging(MODI,2007)或OneNote(2010)(需要依賴Office組件),以及Tesseract(HP->Google)(效果很差)。
附上iTextSharp、PdfBox對PDF的解析代碼。
iTextSharp輔助類
using System;
using System.Collections.Generic;
using System.Text;

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.IO;

namespace eyuan
{
    /// <summary>
    /// Helper methods for reading PDF files through iTextSharp.
    /// Failures are reported through <c>LogHandler.LogWrite</c> and signalled to the
    /// caller with a sentinel return value rather than an exception.
    /// </summary>
    public static class ITextSharpHandler
    {
        /// <summary>
        /// Reads the text content of every page of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>
        /// The text of all pages, each page followed by a line break.
        /// An empty string when the file does not exist or cannot be loaded.
        /// If extraction fails part-way, the error is logged and the text
        /// collected up to that point is returned.
        /// </returns>
        public static string ReadPdf(string fileName)
        {
            PdfReader reader = OpenReader(fileName);
            if (reader == null)
            {
                return string.Empty;
            }

            StringBuilder sbFileContent = new StringBuilder();
            try
            {
                // Page numbers passed to GetTextFromPage start at 1, not 0.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
                }
            }
            catch (Exception ex)
            {
                // Best effort: log and fall through so the pages already extracted are still returned.
                LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失敗,錯誤:{1}", fileName, ex.ToString()));
            }
            finally
            {
                reader.Close();
            }

            return sbFileContent.ToString();
        }

        /// <summary>
        /// Gets the number of pages in a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to inspect.</param>
        /// <returns>
        /// The page count, or -1 when the file does not exist or cannot be loaded.
        /// </returns>
        public static int GetPdfPageCount(string fileName)
        {
            PdfReader reader = OpenReader(fileName);
            if (reader == null)
            {
                return -1;
            }

            try
            {
                return reader.NumberOfPages;
            }
            finally
            {
                // The reader was previously left open on this path, leaking the file handle.
                reader.Close();
            }
        }

        /// <summary>
        /// Opens a <see cref="PdfReader"/> for the given file, logging any failure.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to open.</param>
        /// <returns>
        /// An open reader that the caller must close, or <c>null</c> when the file
        /// does not exist or the reader could not be constructed.
        /// </returns>
        private static PdfReader OpenReader(string fileName)
        {
            if (!File.Exists(fileName))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
                return null;
            }

            try
            {
                return new PdfReader(fileName);
            }
            catch (Exception ex)
            {
                // No cleanup needed here: when the constructor throws, no reader instance exists.
                LogHandler.LogWrite(string.Format(@"加載PDF文件{0}失敗,錯誤:{1}", fileName, ex.ToString()));
                return null;
            }
        }
    }
}
PDFBox輔助類
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace eyuan
{
    /// <summary>
    /// Text extraction from PDF files using the PDFBox component.
    /// </summary>
    public static class PdfBoxHandler
    {
        /// <summary>
        /// Extracts the text of a PDF file with PDFBox.
        /// </summary>
        /// <param name="input">Path of the PDF file.</param>
        /// <returns>
        /// The text content of the PDF, or <c>null</c> when the file does not exist,
        /// cannot be loaded, or text extraction fails (each failure is logged).
        /// </returns>
        public static string ReadPdf(string input)
        {
            // Guard: nothing to do when the file is missing.
            if (!File.Exists(input))
            {
                LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
                return null;
            }

            // Load the PDF document.
            PDDocument document;
            try
            {
                document = PDDocument.load(input);
            }
            catch (Exception loadError)
            {
                LogHandler.LogWrite(string.Format(@"加載PDF文件{0}失敗,錯誤:{1}", input, loadError.ToString()));
                return null;
            }

            // Extract the text; the document is closed whether or not extraction succeeds.
            string extractedText = null;
            try
            {
                PDFTextStripper textStripper = new PDFTextStripper();
                extractedText = textStripper.getText(document);
            }
            catch (Exception parseError)
            {
                LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失敗,錯誤:{1}", input, parseError.ToString()));
            }
            finally
            {
                if (document != null)
                {
                    document.close();
                }
            }

            return extractedText;
        }
    }
}
另外附上PDF轉Image,然后對Image進行OCR的代碼。
轉換PDF為Jpeg圖片代碼(GhostScript輔助類)
1 using System; 2 using System.Collections; 3 using System.Collections.Generic; 4 using System.Runtime.InteropServices; 5 using System.Text; 6 7 namespace eyuan 8 { 9 public class GhostscriptHandler 10 { 11 12 #region GhostScript Import 13 /// <summary>創建Ghostscript的實例 14 /// This instance is passed to most other gsapi functions. 15 /// The caller_handle will be PRovided to callback functions. 16 /// At this stage, Ghostscript supports only one instance. </summary> 17 /// <param name="pinstance"></param> 18 /// <param name="caller_handle"></param> 19 /// <returns></returns> 20 [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")] 21 private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle); 22 /// <summary>This is the important function that will perform the conversion 23 /// 24 /// </summary> 25 /// <param name="instance"></param> 26 /// <param name="argc"></param> 27 /// <param name="argv"></param> 28 /// <returns></returns> 29 [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")] 30 private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv); 31 /// <summary> 32 /// Exit the interpreter. 33 /// This must be called on shutdown if gsapi_init_with_args() has been called, 34 /// and just before gsapi_delete_instance(). 35 /// 退出 36 /// </summary> 37 /// <param name="instance"></param> 38 /// <returns></returns> 39 [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")] 40 private static extern int gsapi_exit(IntPtr instance); 41 /// <summary> 42 /// Destroy an instance
新聞熱點
疑難解答