User-260554092 posted
Hi,
I am extracting text from a PDF, and the encoding does not seem to work. I have two methods to extract the text, because for some PDFs method 1 works and for others method 2 works. I want to combine both but don't understand how...
Also, for method 2 the encoding gets messed up, i.e. whitespace characters come out with ASCII code 63 for some reason. Is there a way to fix this, so that I can call IndexOf with a string containing a space and have it match the whitespace in the extracted text?
/// <summary>
/// Checks whether any page of the PDF at <paramref name="pdf_src"/> contains
/// <paramref name="keyword"/>, using a case-insensitive ordinal match.
/// </summary>
/// <remarks>
/// Each page is extracted twice: first from the raw page content stream via
/// <c>PDFParser.ExtractTextFromPDFBytes</c>, then via iTextSharp's
/// <c>PdfTextExtractor</c> followed by <c>fix_encoding</c>. The search stops at the
/// first hit from either method.
/// </remarks>
/// <param name="keyword">Text to look for.</param>
/// <param name="pdf_src">Source handed to the <c>PdfReader</c> constructor.</param>
/// <returns>
/// <c>true</c> if either extraction method yields text containing the keyword on
/// any page; <c>false</c> if no page matches or if reading/extraction throws.
/// </returns>
public static bool does_document_text_have_keyword(string keyword, string pdf_src)
{
    PdfReader pdfReader = null;
    try
    {
        // One reader serves both extraction methods; the original opened an
        // additional reader per page and never closed it.
        pdfReader = new PdfReader(pdf_src);
        int count = pdfReader.NumberOfPages;

        // PDF page numbers are 1-based.
        for (int page = 1; page <= count; page++)
        {
            // method_1: parse the raw page content bytes.
            string currentText = PDFParser.ExtractTextFromPDFBytes(pdfReader.GetPageContent(page)) + " ";
            if (currentText.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
            {
                return true;
            }

            // method_2: iTextSharp text extraction, then the project's
            // fix_encoding helper (defined elsewhere; presumably normalizes
            // characters that were mis-decoded -- confirm against its source).
            using (StringWriter output = new StringWriter())
            {
                output.WriteLine(PdfTextExtractor.GetTextFromPage(pdfReader, page, new SimpleTextExtractionStrategy()));
                currentText = fix_encoding(output.ToString());
            }
            if (currentText.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
            {
                return true;
            }
        }

        return false;
    }
    catch
    {
        // Deliberate best-effort: an unreadable or malformed PDF (or a null
        // keyword) is reported as "keyword not found" rather than propagated.
        return false;
    }
    finally
    {
        // Runs on every exit path, including the early "return true" branches
        // and the exception path, so the reader is always released.
        if (pdfReader != null)
        {
            pdfReader.Close();
        }
    }
}