using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Office.Interop.Word;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
namespace TestPdfReader
{
    /// <summary>
    /// Console sample that extracts the text of a PDF file (through PDFBox) and of a
    /// Word document (through Word automation), strips whitespace from each, and
    /// prints the results.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Prints the whitespace-stripped text of <c>C:\resume.pdf</c>, then of
        /// <c>C:\resume.doc</c>, and finally waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine(ReadPdfText(@"C:\resume.pdf"));

            // Doc, Docx: the original sample also declared an unused C:\resume.docx path.
            // Presumably a .docx path can be passed to ReadWordText in the same way
            // (depends on the installed Word version - confirm).
            Console.WriteLine(ReadWordText(@"C:\resume.doc"));

            Console.Read();
        }

        /// <summary>
        /// Loads a PDF with PDFBox and returns its text with tabs, line breaks and
        /// spaces removed.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The extracted text without tabs, CR/LF characters or spaces.</returns>
        private static string ReadPdfText(string path)
        {
            PDDocument doc = PDDocument.load(path);
            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                string text = pdfStripper.getText(doc);

                // Tabs and line breaks are first turned into spaces, then every space is
                // dropped, so the net effect is removing all four characters.
                return text.Replace('\t', ' ').Replace('\n', ' ').Replace('\r', ' ').Replace(" ", "");
            }
            finally
            {
                // Close the document even when text extraction throws.
                doc.close();
            }
        }

        /// <summary>
        /// Opens a document read-only in a new Word instance and returns its content
        /// text after <see cref="FilterString"/> has been applied.
        /// </summary>
        /// <param name="path">Path of the Word document to read.</param>
        /// <returns>The filtered text of the document.</returns>
        private static string ReadWordText(string path)
        {
            object docPath = path;
            object missing = System.Reflection.Missing.Value;
            object readOnly = true;

            Application wordApp = new Application();
            try
            {
                Document wordDoc = wordApp.Documents.Open(
                    ref docPath, ref missing, ref readOnly, ref missing,
                    ref missing, ref missing, ref missing, ref missing,
                    ref missing, ref missing, ref missing, ref missing,
                    ref missing, ref missing, ref missing, ref missing);
                try
                {
                    return FilterString(wordDoc.Content.Text);
                }
                finally
                {
                    // Always close the document, even if reading its text failed.
                    wordDoc.Close(ref missing, ref missing, ref missing);
                    Marshal.ReleaseComObject(wordDoc);
                }
            }
            finally
            {
                // Always quit Word so a failure does not leave the started instance running.
                wordApp.Quit(ref missing, ref missing, ref missing);
                Marshal.ReleaseComObject(wordApp);
            }
        }

        /// <summary>
        /// Removes BEL characters and all whitespace from <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Text to clean; may be null or empty.</param>
        /// <returns>
        /// The text without <c>\a</c> or whitespace characters, or an empty string when
        /// <paramref name="input"/> is null or empty.
        /// </returns>
        private static string FilterString(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return string.Empty;
            }

            // Single backslashes are required inside the verbatim string: "\\a" would match
            // a literal backslash followed by 'a' instead of the BEL character.
            // NOTE(review): \a is presumably Word's table-cell end marker - confirm.
            return Regex.Replace(input, @"(\a|\t|\n|\s+)", "");
        }
    }
}
// This snippet comes from http://www.codesnippet.cn/detail/240620134251.html
// Source: http://www.codesnippet.cn/detail/240620134251.html