How to get all URLs of the page
By K S
How to get all URLs of the page
Topic : How to get all URLs of the page
namespace PubishApps
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text.RegularExpressions;

    /************************************************
    * Topic : How to fetch all URLs of the site.
    * Author : kalit sikka
    * For : http://eggheadcafe.com
    * **********************************************/

    /// <summary>
    /// Downloads a web page and reports every <c>href</c> target found in its
    /// markup, echoing each match to the console and appending it to a log file
    /// in the current user's documents folder.
    /// </summary>
    class FetchURLsFromSite
    {
        /// <summary>Request timeout, in milliseconds.</summary>
        private const int RequestTimeoutMilliseconds = 10000;

        /// <summary>Name of the log file created in the documents folder.</summary>
        private const string LogFileName = "LogFile.txt";

        /// <summary>
        /// Matches <c>href=</c> attributes and captures the target in the
        /// <c>url</c> group. Targets starting with <c>#</c>, <c>mailto</c>,
        /// <c>location.</c> or <c>javascript</c>, and targets followed by
        /// <c>css</c> or <c>this.</c> later on the same line, are skipped by the
        /// negative lookahead. Built once because the pattern never changes.
        /// </summary>
        private static readonly Regex HrefRegex = new Regex(
            @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?<url>.*?)(?:[\s>""'])",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Fetches the given page and reports every URL referenced by an
        /// <c>href</c> attribute in it.
        /// </summary>
        /// <param name="webPage">Absolute HTTP/HTTPS address of the page to scan.</param>
        /// <exception cref="WebException">The page could not be downloaded.</exception>
        public void FetchUrls(string webPage)
        {
            GetAllUrls(GetContent(webPage));
        }

        /// <summary>
        /// Downloads the page and returns its body as a string.
        /// </summary>
        /// <param name="webPage">Absolute HTTP/HTTPS address of the page.</param>
        /// <returns>The full response body.</returns>
        /// <exception cref="WebException">
        /// The request failed or timed out. The exception propagates unchanged so
        /// the caller sees the original stack trace.
        /// </exception>
        private string GetContent(string webPage)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
            request.Timeout = RequestTimeoutMilliseconds;

            // The using statements release the response and its stream even when
            // reading fails, and are never entered with a null reference if
            // GetResponse itself throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Scans the markup for <c>href</c> attributes, prints each match to the
        /// console and appends it to the log file.
        /// </summary>
        /// <param name="content">Markup to scan; null or empty yields no output.</param>
        private void GetAllUrls(string content)
        {
            if (string.IsNullOrEmpty(content))
            {
                return;
            }

            // Resolve the documents folder of whoever runs the program instead of
            // pointing into one specific user's profile.
            string logFile = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
                LogFileName);

            for (Match match = HrefRegex.Match(content); match.Success; match = match.NextMatch())
            {
                // Groups[0] is the whole "href=..." match; "url" is the only
                // capturing group and holds just the link target.
                string wholeMatch = match.Groups[0].Value;

                Console.WriteLine("href match: " + wholeMatch);
                WriteToLog(logFile, "href match: " + wholeMatch + "\r\n");
                Console.WriteLine("Url match: " + match.Groups["url"].Value);
            }
        }

        /// <summary>
        /// Appends a timestamped line to the given log file, creating the file
        /// if it does not exist.
        /// </summary>
        /// <param name="file">Path of the log file.</param>
        /// <param name="message">Text to append after the timestamp.</param>
        private void WriteToLog(string file, string message)
        {
            // Disposing the writer flushes and closes the file.
            using (StreamWriter writer = File.AppendText(file))
            {
                writer.WriteLine(DateTime.Now.ToString() + ": " + message);
            }
        }
    }
}
Popularity (536 Views)
Article Discussion: How to get all URLs of the page
K S posted at Monday, January 12, 2009 6:38 AM