locked
Extract Email addresses from HTML RRS feed

  • Question

  • Hello folks,

    In my code I want to download a website's HTML page and extract all of the email addresses from it.

    How can I do it using C#?

    Yair

    Thursday, December 31, 2009 5:36 AM

Answers

  • using System.Net;
            /// <summary>
            /// Console entry point: prompts for a URI, downloads the resource behind it,
            /// decodes the bytes as ASCII text and prints every e-mail address that
            /// GetEmailsFromWebContent finds in that text, one per line.
            /// </summary>
            /// <param name="args">Command-line arguments (not used).</param>
            static void Main(string[] args)
            {
                Console.Write("\nPlease enter a URI (for example, http://www.sample.com):");
                string remoteUri = Console.ReadLine();

                // ReadLine returns null once the input stream is closed; stop here
                // instead of handing a null/empty address to DownloadData.
                if (string.IsNullOrEmpty(remoteUri))
                {
                    Console.WriteLine("No URI entered.");
                    return;
                }

                byte[] myDataBuffer;

                // WebClient is disposable; the using block releases it even when
                // the download throws.
                using (WebClient myWebClient = new WebClient())
                {
                    Console.WriteLine("Downloading " + remoteUri);
                    myDataBuffer = myWebClient.DownloadData(remoteUri);
                }

                string download = Encoding.ASCII.GetString(myDataBuffer);
                Console.WriteLine("Download successful.");

                String[] Emails = GetEmailsFromWebContent(download);
                foreach (String Email in Emails)
                {
                    Console.WriteLine(Email);
                }
            }
    
            /// <summary>
            /// Returns every substring of <paramref name="webcontent"/> that matches the
            /// e-mail pattern below, in order of appearance (repeated addresses are kept).
            /// </summary>
            /// <param name="webcontent">Text to scan, e.g. the source of a downloaded page.</param>
            /// <returns>The matched strings; an empty array when nothing matches.</returns>
            private static string[] GetEmailsFromWebContent(string webcontent)
            {
                const string emailPattern = "([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})";

                MatchCollection found = Regex.Matches(webcontent, emailPattern);
                string[] addresses = new string[found.Count];

                // Copy the matched text of each hit into the result array.
                int slot = 0;
                foreach (Match hit in found)
                {
                    addresses[slot] = hit.Value;
                    slot++;
                }

                return addresses;
            }
    

    Happy Coding, RDRaja
    • Proposed as answer by Dharmalinga Raja Thursday, December 31, 2009 11:03 AM
    • Marked as answer by Chao Kuo Wednesday, January 6, 2010 5:42 AM
    Thursday, December 31, 2009 6:07 AM
  •         // E-mail addresses collected by extractEmail, which only adds values not already in the list.
            List<string> emails = new List<string>();
            // Address of the page that ReadWebPage downloads.
            String webPageUrl = "http://www.google.co.uk/";
    
            /// <summary>
            /// Downloads the resource at <c>webPageUrl</c> and returns everything read
            /// from the response stream as a single string.
            /// </summary>
            /// <returns>The complete text read from the response.</returns>
            public String ReadWebPage()
            {
                // The using blocks release the client, the response stream and the
                // reader even when OpenRead or ReadToEnd throws.
                using (WebClient Client = new WebClient())
                using (Stream strm = Client.OpenRead(webPageUrl))
                using (StreamReader sr = new StreamReader(strm))
                {
                    return sr.ReadToEnd();
                }
            }
    
            /// <summary>
            /// Scans <paramref name="htmlDoc"/> for e-mail addresses (case-insensitive
            /// pattern) and appends each one to <c>emails</c> unless the list already
            /// contains that exact string.
            /// </summary>
            /// <param name="htmlDoc">Text to scan, e.g. the result of ReadWebPage.</param>
            public void extractEmail(String htmlDoc)
            {
                const string emailPattern = "\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\\b";

                foreach (Match hit in Regex.Matches(htmlDoc, emailPattern, RegexOptions.IgnoreCase))
                {
                    string address = hit.Value;

                    // Already recorded: nothing to do for this hit.
                    if (emails.Contains(address))
                    {
                        continue;
                    }

                    emails.Add(address);
                }
            }
    • Proposed as answer by Chao Kuo Monday, January 4, 2010 10:09 AM
    • Marked as answer by Chao Kuo Wednesday, January 6, 2010 5:43 AM
    Thursday, December 31, 2009 6:33 AM

All replies

  • using System.Net;
            /// <summary>
            /// Console entry point: prompts for a URI, downloads the resource behind it,
            /// decodes the bytes as ASCII text and prints every e-mail address that
            /// GetEmailsFromWebContent finds in that text, one per line.
            /// </summary>
            /// <param name="args">Command-line arguments (not used).</param>
            static void Main(string[] args)
            {
                Console.Write("\nPlease enter a URI (for example, http://www.sample.com):");
                string remoteUri = Console.ReadLine();

                // ReadLine returns null once the input stream is closed; stop here
                // instead of handing a null/empty address to DownloadData.
                if (string.IsNullOrEmpty(remoteUri))
                {
                    Console.WriteLine("No URI entered.");
                    return;
                }

                byte[] myDataBuffer;

                // WebClient is disposable; the using block releases it even when
                // the download throws.
                using (WebClient myWebClient = new WebClient())
                {
                    Console.WriteLine("Downloading " + remoteUri);
                    myDataBuffer = myWebClient.DownloadData(remoteUri);
                }

                string download = Encoding.ASCII.GetString(myDataBuffer);
                Console.WriteLine("Download successful.");

                String[] Emails = GetEmailsFromWebContent(download);
                foreach (String Email in Emails)
                {
                    Console.WriteLine(Email);
                }
            }
    
            /// <summary>
            /// Returns every substring of <paramref name="webcontent"/> that matches the
            /// e-mail pattern below, in order of appearance (repeated addresses are kept).
            /// </summary>
            /// <param name="webcontent">Text to scan, e.g. the source of a downloaded page.</param>
            /// <returns>The matched strings; an empty array when nothing matches.</returns>
            private static string[] GetEmailsFromWebContent(string webcontent)
            {
                const string emailPattern = "([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})";

                MatchCollection found = Regex.Matches(webcontent, emailPattern);
                string[] addresses = new string[found.Count];

                // Copy the matched text of each hit into the result array.
                int slot = 0;
                foreach (Match hit in found)
                {
                    addresses[slot] = hit.Value;
                    slot++;
                }

                return addresses;
            }
    

    Happy Coding, RDRaja
    • Proposed as answer by Dharmalinga Raja Thursday, December 31, 2009 11:03 AM
    • Marked as answer by Chao Kuo Wednesday, January 6, 2010 5:42 AM
    Thursday, December 31, 2009 6:07 AM
  • /// <summary>
    /// Reads the file at <paramref name="htmlFileName"/> and collects every
    /// whitespace-delimited token that contains an '@' character.
    /// </summary>
    /// <param name="htmlFileName">Path of the file to scan.</param>
    /// <returns>
    /// The matching tokens in order of appearance, without surrounding whitespace;
    /// an empty list when the text contains no '@'.
    /// </returns>
    List<string> ExtractEMails(string htmlFileName)
    {
        List<string> ret = new List<string>();

        string s;
        using (StreamReader sr = new StreamReader(htmlFileName))
        {
            s = sr.ReadToEnd();
        }

        // ">= 0" (not "> 0") so an '@' at the very first character is not skipped.
        int i = s.IndexOf('@');
        while (i >= 0)
        {
            // Walk left to the first character of the token holding this '@'.
            int start = i;
            while (start > 0 && !char.IsWhiteSpace(s[start - 1]))
            {
                start--;
            }

            // Walk right to the end of the token. Stopping at the end of the text
            // means a final address with nothing after it no longer throws, and
            // the delimiter itself is never copied into the result.
            int end = i + 1;
            while (end < s.Length && !char.IsWhiteSpace(s[end]))
            {
                end++;
            }

            ret.Add(s.Substring(start, end - start));

            // Resume after this token so a token with several '@' is reported once.
            i = s.IndexOf('@', end);
        }

        return ret;
    }

    With best regards, Yasser Zamani
    Thursday, December 31, 2009 6:16 AM
  •         // E-mail addresses collected by extractEmail, which only adds values not already in the list.
            List<string> emails = new List<string>();
            // Address of the page that ReadWebPage downloads.
            String webPageUrl = "http://www.google.co.uk/";
    
            /// <summary>
            /// Downloads the resource at <c>webPageUrl</c> and returns everything read
            /// from the response stream as a single string.
            /// </summary>
            /// <returns>The complete text read from the response.</returns>
            public String ReadWebPage()
            {
                // The using blocks release the client, the response stream and the
                // reader even when OpenRead or ReadToEnd throws.
                using (WebClient Client = new WebClient())
                using (Stream strm = Client.OpenRead(webPageUrl))
                using (StreamReader sr = new StreamReader(strm))
                {
                    return sr.ReadToEnd();
                }
            }
    
            /// <summary>
            /// Scans <paramref name="htmlDoc"/> for e-mail addresses (case-insensitive
            /// pattern) and appends each one to <c>emails</c> unless the list already
            /// contains that exact string.
            /// </summary>
            /// <param name="htmlDoc">Text to scan, e.g. the result of ReadWebPage.</param>
            public void extractEmail(String htmlDoc)
            {
                const string emailPattern = "\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\\b";

                foreach (Match hit in Regex.Matches(htmlDoc, emailPattern, RegexOptions.IgnoreCase))
                {
                    string address = hit.Value;

                    // Already recorded: nothing to do for this hit.
                    if (emails.Contains(address))
                    {
                        continue;
                    }

                    emails.Add(address);
                }
            }
    • Proposed as answer by Chao Kuo Monday, January 4, 2010 10:09 AM
    • Marked as answer by Chao Kuo Wednesday, January 6, 2010 5:43 AM
    Thursday, December 31, 2009 6:33 AM
  • Thank you all for the very informative examples.

    I need the connection code to support proxy connections - do you know how to overcome this obstacle?

    Currently, I'm getting: "The underlying connection was closed: Unable to connect to the remote server." probably because of the proxy connection in our company.

    Yair
    Thursday, December 31, 2009 9:48 AM
  • If the proxy is blocking your access, then you need to ask your administrator for access.



    Happy Coding, RDRaja
    Thursday, December 31, 2009 9:54 AM
  • I do have access. In the code, should I add or change something regarding it?
    Thursday, December 31, 2009 9:59 AM
  • Thank you all for the very informative examples.

    I need the connection code to suppot proxy connections - Do you know how to overcome this obstacle?

    Currently, i'm getting: "The underlying connection was closed: Unable to connect to the remote server." probably because of the proxy connection in our company.

    Yair
    The WebClient class has a Proxy property.
    Thursday, December 31, 2009 10:04 AM
  • Are you getting any errors, or are you just not getting the expected output?
    Happy Coding, RDRaja
    Thursday, December 31, 2009 10:04 AM
  • Exception: "The underlying connection was closed: Unable to connect to the remote server."
    Thursday, December 31, 2009 10:16 AM
  • Thank you all for the very informative examples.

    I need the connection code to suppot proxy connections - Do you know how to overcome this obstacle?

    Currently, i'm getting: "The underlying connection was closed: Unable to connect to the remote server." probably because of the proxy connection in our company.

    Yair
    WebClient class has proxy property.

    Hi Peter Crab,

    Can I use the proxy settings in my IE browser in the code?

    Yair
    Thursday, December 31, 2009 10:17 AM
  • Thank you all for the very informative examples.

    I need the connection code to suppot proxy connections - Do you know how to overcome this obstacle?

    Currently, i'm getting: "The underlying connection was closed: Unable to connect to the remote server." probably because of the proxy connection in our company.

    Yair
    WebClient class has proxy property.

    Hi Peter Crab,

    Can I use the proxy settings in my IE browser in the code?

    Yair

    you sure can
    Thursday, December 31, 2009 10:28 AM
  • How?
    Thursday, December 31, 2009 11:03 AM
  • Hello, Yair
    As far as I have investigated, this is not a proxy issue, because the proxy is set by the system using configuration files and the Internet Explorer Local Area Network settings. So you don't need to set the proxy property manually. I think there may be something else that blocks your request. Do you have a firewall installed? A firewall could block your request and cause such an exception.
    Thanks
    Chao
    Monday, January 4, 2010 10:41 AM