C#解析PDF

2019-11-17 03:07:42

字體：大中小

來源：轉載

供稿：網友

C#解析PDF

C#解析PDF的方式有很多，比較好用的有iTextSharp和PDFBox。

PDF內容頁如果是圖片類型，例如掃描件，則需要進行OCR（光學字符識別）。

文本內容的PDF文檔，解析的過程中，我目前僅發現能以字符串的形式讀取的，不能夠讀取其中的表格。據說PDF文檔結構中是沒有表格概念的，因此這個自然是讀不到的，如果果真如此，則PDF中表格內容的解析，只能對獲取到的字符串按照一定的邏輯自行解析了。

iTextSharp是一個C#開源項目；PDFBox為Java開源項目，借助於IKVM可以在.NET平臺下使用。

Pdf轉換Image，使用的是GhostScript，可以以API的方式調用，也可以以Windows命令行的方式調用。

OCR使用的是Asprise，識別效果較好（商業），另外還可以使用MS的ImageScanning（2007）或OneNote（2010）（需要依賴Office組件），或者Tesseract（HP->Google）（效果很差）。

附上iTextSharp、PDFBox對PDF的解析代碼。

iTextSharp輔助類

  using System;
  using System.Collections.Generic;
  using System.Text;

  using iTextSharp.text.pdf;
  using iTextSharp.text.pdf.parser;
  using System.IO;

  namespace eyuan
  {
      /// <summary>
      /// Helper methods that read PDF files through iTextSharp.
      /// </summary>
      public static class ITextSharpHandler
      {
          /// <summary>
          /// Reads the text content of every page of a PDF file.
          /// </summary>
          /// <param name="fileName">Path of the PDF file to read.</param>
          /// <returns>
          /// The extracted text, one appended line per page. Returns
          /// <see cref="string.Empty"/> when the file does not exist or cannot be
          /// opened. If extraction fails part-way, the text collected from the
          /// pages processed so far is returned.
          /// </returns>
          public static string ReadPdf(string fileName)
          {
              PdfReader reader = OpenReader(fileName);
              if (reader == null)
              {
                  return string.Empty;
              }

              StringBuilder sbFileContent = new StringBuilder();
              try
              {
                  // Page numbers passed to the extractor start at 1.
                  for (int i = 1; i <= reader.NumberOfPages; i++)
                  {
                      sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
                  }
              }
              catch (Exception ex)
              {
                  // Best effort: log the failure and fall through so that the text
                  // gathered before the failure is still returned.
                  LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失敗,錯誤:{1}", fileName, ex.ToString()));
              }
              finally
              {
                  reader.Close();
              }

              return sbFileContent.ToString();
          }

          /// <summary>
          /// Gets the number of pages in a PDF file.
          /// </summary>
          /// <param name="fileName">Path of the PDF file to inspect.</param>
          /// <returns>
          /// The page count, or -1 when the file does not exist or cannot be opened.
          /// </returns>
          public static int GetPdfPageCount(string fileName)
          {
              PdfReader reader = OpenReader(fileName);
              if (reader == null)
              {
                  return -1;
              }

              try
              {
                  return reader.NumberOfPages;
              }
              finally
              {
                  // The reader must be closed on the success path too; the previous
                  // implementation returned without closing it.
                  reader.Close();
              }
          }

          /// <summary>
          /// Opens a <see cref="PdfReader"/> for the given file, logging any failure.
          /// </summary>
          /// <param name="fileName">Path of the PDF file to open.</param>
          /// <returns>
          /// An open reader that the caller must close, or null when the file does
          /// not exist or the reader could not be constructed.
          /// </returns>
          private static PdfReader OpenReader(string fileName)
          {
              if (!File.Exists(fileName))
              {
                  LogHandler.LogWrite(@"指定的PDF文件不存在：" + fileName);
                  return null;
              }

              try
              {
                  return new PdfReader(fileName);
              }
              catch (Exception ex)
              {
                  // No reader instance exists when the constructor throws, so there
                  // is nothing to close here.
                  LogHandler.LogWrite(string.Format(@"加載PDF文件{0}失敗,錯誤:{1}", fileName, ex.ToString()));
                  return null;
              }
          }
      }
  }

PDFBox輔助類

 using org.pdfbox.pdmodel;
 using org.pdfbox.util;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;

 namespace eyuan
 {
     /// <summary>
     /// Helper for extracting text from PDF files with the PDFBox component.
     /// </summary>
     public static class PdfBoxHandler
     {
         /// <summary>
         /// Extracts the text of a PDF file using PDFBox.
         /// </summary>
         /// <param name="input">Path of the PDF file.</param>
         /// <returns>
         /// The text of the PDF, or null when the file does not exist, cannot be
         /// loaded, or text extraction fails.
         /// </returns>
         public static string ReadPdf(string input)
         {
             // Guard: nothing to do when the file is missing.
             if (!File.Exists(input))
             {
                 LogHandler.LogWrite(@"指定的PDF文件不存在：" + input);
                 return null;
             }

             // Load the document; a failure here is logged and reported as null.
             PDDocument document = null;
             try
             {
                 document = PDDocument.load(input);
             }
             catch (Exception loadError)
             {
                 LogHandler.LogWrite(string.Format(@"加載PDF文件{0}失敗,錯誤:{1}", new string[] { input, loadError.ToString() }));
                 return null;
             }

             // Extract the text; the document is closed whether or not this succeeds.
             string extractedText = null;
             try
             {
                 PDFTextStripper textStripper = new PDFTextStripper();
                 extractedText = textStripper.getText(document);
             }
             catch (Exception parseError)
             {
                 LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失敗,錯誤:{1}", new string[] { input, parseError.ToString() }));
             }
             finally
             {
                 if (document != null)
                 {
                     document.close();
                     document = null;
                 }
             }

             return extractedText;
         }
     }
 }

另外附上PDF轉Image，然后對Image進行OCR的代碼。

轉換PDF為Jpeg圖片代碼（GhostScript輔助類）

  1 using System;  2 using System.Collections;  3 using System.Collections.Generic;  4 using System.Runtime.InteropServices;  5 using System.Text;  6   7 namespace eyuan  8 {  9     public class GhostscriptHandler 10     { 11  12         #region GhostScript Import 13         /// <summary>創建Ghostscript的實例 14         /// This instance is passed to most other gsapi functions.  15         /// The caller_handle will be PRovided to callback functions.   16         ///  At this stage, Ghostscript supports only one instance. </summary>   17         /// <param name="pinstance"></param>   18         /// <param name="caller_handle"></param>   19         /// <returns></returns>    20         [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")] 21         private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle); 22         /// <summary>This is the important function that will perform the conversion 23         ///  24         /// </summary>   25         /// <param name="instance"></param>   26         /// <param name="argc"></param>   27         /// <param name="argv"></param>   28         /// <returns></returns>   29         [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")] 30         private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv); 31         /// <summary>   32         /// Exit the interpreter.  33         /// This must be called on shutdown if gsapi_init_with_args() has been called,  34         /// and just before gsapi_delete_instance(). 35         /// 退出 36         /// </summary>   37         /// <param name="instance"></param>   38         /// <returns></returns>   39         [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")] 40         private static extern int gsapi_exit(IntPtr instance); 41         /// <summary>   42         /// Destroy an instance

上一篇：高性能網站架構設計之緩存篇（1）- Redis的安裝與使用

下一篇：【Unity3D基礎教程】給初學者看的Unity教程（二）：所有腳本組件的基類 -- MonoBehaviour的前世今生