locked
Block or skip the description in a resume PDF file. RSS feed

  • Question

  • User-2132497554 posted

    Hi folks,

    I am using the iTextSharp library. Here is my sample code:

     /// <summary>
     /// Reads "Pdf\MyFile.pdf" (resolved two directory levels above the current
     /// working directory), extracts its text, pulls the entries of every
     /// "Experience" section out of it and prints them to the console.
     /// Does nothing but wait for Enter when the file does not exist.
     /// </summary>
     private static void PDFReadTest()
            {
                string pdfFilePath = System.IO.Path.Combine(Directory.GetParent(System.IO.Directory.GetCurrentDirectory())
                    .Parent.FullName, "Pdf\\MyFile.pdf");

                if (System.IO.File.Exists(pdfFilePath))
                {
                    List<string> lines = ReadPdfLines(pdfFilePath);
                    List<List<string>> listGroup = SplitIntoGroups(lines);
                    List<Experience> listExperience = ParseExperiences(listGroup);

                    foreach (Experience ex in listExperience)
                    {
                        Console.WriteLine(ex.CompanyName);
                        Console.WriteLine(ex.JobTitle);
                        Console.WriteLine(ex.FromYear);
                        Console.WriteLine(ex.ToYear);

                        Console.WriteLine();
                        Console.WriteLine("---------------------");
                        Console.WriteLine();
                    }
                }
                Console.ReadLine();
            }

            /// <summary>
            /// Extracts the text of every page of the PDF and returns it split on '\n'.
            /// </summary>
            /// <param name="pdfFilePath">Path of the PDF file to read.</param>
            /// <returns>The trimmed document text, one list element per line.</returns>
            private static List<string> ReadPdfLines(string pdfFilePath)
            {
                StringBuilder pdfText = new StringBuilder();

                // The using block disposes the reader; no explicit Close() is needed.
                using (PdfReader pdfReader = new PdfReader(pdfFilePath))
                {
                    for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                    {
                        // Keep pages apart: without a line break the last line of one
                        // page would be fused with the first line of the next one.
                        if (pdfText.Length > 0 && pdfText[pdfText.Length - 1] != '\n')
                        {
                            pdfText.Append('\n');
                        }

                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        pdfText.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
                    }
                }

                return pdfText.ToString().Trim().Split('\n').ToList();
            }

            /// <summary>
            /// Splits the lines into groups, using every line that is exactly " "
            /// as a separator. The separator lines themselves are dropped.
            /// </summary>
            /// <param name="lines">Lines of the document, in order.</param>
            /// <returns>
            /// One list per group. Consecutive separators yield empty groups; a
            /// trailing separator does not add an extra empty group.
            /// </returns>
            private static List<List<string>> SplitIntoGroups(List<string> lines)
            {
                const string separator = " ";

                List<List<string>> groups = new List<List<string>>();
                List<string> current = new List<string>();

                foreach (string line in lines)
                {
                    if (line == separator)
                    {
                        groups.Add(current);
                        current = new List<string>();
                    }
                    else
                    {
                        current.Add(line);
                    }
                }

                // Lines after the last separator form the final group.
                if (current.Count > 0)
                {
                    groups.Add(current);
                }

                return groups;
            }

            /// <summary>
            /// Builds <see cref="Experience"/> entries from every group whose first
            /// line contains "Experience" (case-insensitive). After that heading the
            /// group is read in steps of three lines: company name, job title, and a
            /// line from which the first two four-digit numbers are taken as the
            /// from/to years.
            /// </summary>
            /// <param name="groups">Line groups of the document.</param>
            /// <returns>
            /// The parsed entries. Values that are not present (a truncated last
            /// entry, or a date line with fewer than two years) are left null.
            /// </returns>
            private static List<Experience> ParseExperiences(List<List<string>> groups)
            {
                const int propertiesPerEntry = 3;
                const string yearPattern = @"\d{4}";

                List<Experience> experiences = new List<Experience>();

                foreach (List<string> line in groups)
                {
                    if (line.Count == 0)
                    {
                        continue;
                    }

                    // Ordinal comparison: the heading match must not depend on the
                    // current culture's casing rules.
                    if (line[0].IndexOf("Experience", StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        continue;
                    }

                    // Index 0 is the heading, so entries start at 1. The index is
                    // local to each group so every "Experience" section is read
                    // from its own beginning.
                    for (int x = 1; x < line.Count; x += propertiesPerEntry)
                    {
                        Experience experience = new Experience();
                        experience.CompanyName = line[x];

                        // The last entry may be incomplete; only read what exists.
                        if (x + 1 < line.Count)
                        {
                            experience.JobTitle = line[x + 1];
                        }

                        if (x + 2 < line.Count)
                        {
                            MatchCollection years = Regex.Matches(line[x + 2], yearPattern);

                            if (years.Count > 0)
                            {
                                experience.FromYear = years[0].Value;
                            }

                            // A range such as "2018 - Present" has only one year.
                            if (years.Count > 1)
                            {
                                experience.ToYear = years[1].Value;
                            }
                        }

                        experiences.Add(experience);
                    }
                }

                return experiences;
            }

     public class Experience
        {
            public string CompanyName { get; set; }
            public string JobTitle { get; set; }
            public string FromYear { get; set; }
            public string ToYear { get; set; }
        }

    Here is my pdf file:


    I do not want to display some of the descriptions (depending on the number of lines or characters) in the console app. Do you have any idea how to do this, if it is possible?
     
    I am waiting for your response.

    Thanks in advance.


    Monday, November 16, 2020 8:53 AM

Answers

  • User-821857111 posted

    Resume parsing is a very complicated exercise. It is not something that you can do with just string manipulation functions and Regex unless you can guarantee that every resume is structured identically. Commercial solutions use Natural Language Processing (https://en.wikipedia.org/wiki/Natural_language_processing).

    • Marked as answer by Anonymous Thursday, October 7, 2021 12:00 AM
    Monday, November 16, 2020 3:50 PM