昨天花了一天研究 PDF 關鍵字定位,走了不少彎路,網上找到的很多方案都有瑕疵,今天有時間跟大家分享一下:
以下方法會逐頁掃描 PDF 的內容,定位出關鍵字的大致坐標。
import com.google.common.collect.Lists;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.AcroFields;
import com.itextpdf.text.pdf.AcroFields.FieldPosition;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
// 定義返回頁碼 PRivate static int i = 0; private static com.itextpdf.awt.geom.Rectangle2D.Float boundingRectange =null; private static StringBuilder content; private static List<Object[]> arrays = Lists.newArrayList();
private static List<Object[]> getKeyWords(String filePath, final String keyWord) {
try { PdfReader pdfReader = new PdfReader(filePath); int pageNum = pdfReader.getNumberOfPages(); PdfReaderContentParser pdfReaderContentParser = new PdfReaderContentParser(pdfReader); for (i = 1; i < pageNum; i++) { content = new StringBuilder(); boundingRectange =new com.itextpdf.awt.geom.Rectangle2D.Float(); pdfReaderContentParser.processContent(i, new RenderListener() { @Override public void renderText(TextRenderInfo textRenderInfo) { String text = textRenderInfo.getText(); // 整頁內(nèi)容 content.append(text); boundingRectange= textRenderInfo.getBaseline().getBoundingRectange(); /*if (null != text && StringUtils.contains(content, keyWord)) { float[] resu = new float[3]; resu[0] = boundingRectange.x; resu[1] = boundingRectange.y; resu[2] = i; arrays.add(resu); }*/ } @Override public void renderImage(ImageRenderInfo arg0) { // TODO Auto-generated method stub } @Override public void endTextBlock() { // TODO Auto-generated method stub } @Override public void beginTextBlock() { // TODO Auto-generated method stub } }); if (null != content && StringUtils.contains(content, keyWord)) { Object[] resu = new Object[4]; resu[0] = content; resu[1] = boundingRectange.x; resu[2] = boundingRectange.y; resu[3] = i; arrays.add(resu); } // System.out.println("第"+i+"頁,內(nèi)容:"+content); } } catch (IOException e) { e.printStackTrace(); } return arrays; }以上方法中使用到的jar包
itextpdf-5.5.6.jar
新聞熱點
疑難解答