自动获取网页内容的方法
在 .NET 中自动获取网页内容有以下三种方法:WebClient、WebBrowser、HttpWebRequest/HttpWebResponse。
1.WebClient
WebClient 类提供向 URI 标识的任何本地、Intranet 或 Internet 资源发送数据以及从这些资源接收数据的公共方法。
WebClient 类使用 WebRequest 类提供对资源的访问。WebClient 实例可以通过任何已向 WebRequest.RegisterPrefix 方法注册的 WebRequest 子代访问数据。
默认情况下,WebClient 实例不发送可选的 HTTP 标头。如果您的请求需要可选标头,必须将该标头添加到 Headers 集合。例如,要在响应中保留查询,必须添加用户代理(User-Agent)标头。此外,如果缺少用户代理标头,服务器可能返回 500(内部服务器错误)。
/// <summary>
/// Downloads the resource identified by the first command-line argument
/// and writes its text to standard output.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the URI to retrieve.</param>
/// <exception cref="ApplicationException">Thrown when no URI is supplied.</exception>
public static void Main (string[] args)
{
    if (args == null || args.Length == 0)
    {
        throw new ApplicationException ("Specify the URI of the resource to retrieve.");
    }

    // The using blocks release the client, the response stream and the reader
    // even when opening or reading the resource throws.
    using (WebClient client = new WebClient ())
    {
        // Add a user agent header in case the
        // requested URI contains a query.
        client.Headers.Add ("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

        using (Stream data = client.OpenRead (args[0]))
        using (StreamReader reader = new StreamReader (data))
        {
            Console.WriteLine (reader.ReadToEnd ());
        }
    }
}
// NOTE(review): this brace block repeats the body of the Main method shown directly above,
// without a method signature; it looks like a copy/paste artifact, not a standalone definition.
{
// Reject a missing or empty argument list: args[0] is used as the URI below.
if (args == null || args.Length == 0)
{
throw new ApplicationException ("Specify the URI of the resource to retrieve.");
}
WebClient client = new WebClient ();
// Add a user agent header in case the
// requested URI contains a query.
client.Headers.Add ("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
// Open the resource as a stream, read it completely as text and print it.
Stream data = client.OpenRead (args[0]);
StreamReader reader = new StreamReader (data);
string s = reader.ReadToEnd ();
Console.WriteLine (s);
// NOTE(review): these Close calls are skipped if any call above throws,
// and the WebClient itself is never disposed.
data.Close ();
reader.Close ();
}
/// <summary>
/// Downloads the page at http://www.mayb.cn with a <see cref="WebClient"/>
/// and returns it decoded as text.
/// </summary>
/// <returns>
/// The decoded page content, or an empty string when the download fails
/// with a <see cref="WebException"/> (the message is written to the console).
/// </returns>
public string GetHtmlContentFromWeb()
{
    string contentHtml = "";
    try
    {
        // WebClient is disposable; the using block releases it even if the download throws.
        using (WebClient webClient = new WebClient())
        {
            // Authenticate the request with the default credentials of the current context.
            webClient.Credentials = CredentialCache.DefaultCredentials;

            byte[] pageData = webClient.DownloadData("http://www.mayb.cn");

            // The decoder must match the encoding the page is served in.
            // Encoding.Default is kept from the original; for a UTF-8 page use
            // Encoding.UTF8.GetString(pageData) instead.
            contentHtml = Encoding.Default.GetString(pageData);
        }
    }
    catch (WebException webEx)
    {
        // Best effort: report the failure and fall through to return the empty string.
        Console.WriteLine(webEx.Message);
    }
    return contentHtml;
}
// NOTE(review): this brace block repeats the body of GetHtmlContentFromWeb() shown directly above,
// without a method signature; it looks like a copy/paste artifact, not a standalone definition.
{
string contentHtml = "";
try
{
WebClient webClient = new WebClient();
// Set the network credentials used to authenticate the request to the Internet resource
webClient.Credentials = CredentialCache.DefaultCredentials;
// Download the raw bytes from the target URL
Byte[] pageData = webClient.DownloadData("http://www.mayb.cn");
// Convert the page bytes to a string; the encoding used must match the page's encoding
contentHtml = Encoding.Default.GetString(pageData);
// If the fetched page is encoded as UTF-8, use this line instead
//contentHtml = Encoding.UTF8.GetString(pageData);
}
catch (WebException webEx)
{
// On a web failure the message is printed and the empty string initialised above is returned.
Console.WriteLine(webEx.Message.ToString());
}
return contentHtml;
}
2.WebBrowser
使用 WebBrowser 控件可以在 Windows 窗体应用程序中承载网页以及支持浏览器的其他文档。其中WebBrowser.Navigate 方法是将指定位置的文档加载到 WebBrowser 控件中。
/// <summary>
/// Starts loading http://www.mayb.cn in a new <see cref="WebBrowser"/> control.
/// The page content is processed in <c>web_DocumentCompleted</c> once loading finishes.
/// </summary>
public void GetHtmlContentFromWeb()
{
    WebBrowser web = new WebBrowser();

    // Subscribe before navigating so the completion event can never be
    // raised while no handler is attached.
    web.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(web_DocumentCompleted);
    web.Navigate("http://www.mayb.cn");
}
/// <summary>
/// Handles <c>DocumentCompleted</c>: collects the inner text of every
/// "Table" element of the loaded document.
/// </summary>
/// <param name="sender">The <see cref="WebBrowser"/> that raised the event.</param>
/// <param name="e">Event data (not used).</param>
void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    WebBrowser web = (WebBrowser)sender;

    // Guard against a missing document instead of failing with a NullReferenceException.
    if (web.Document == null)
    {
        return;
    }

    // StringBuilder avoids re-copying the accumulated text for every table.
    System.Text.StringBuilder tableText = new System.Text.StringBuilder();
    foreach (HtmlElement item in web.Document.GetElementsByTagName("Table"))
    {
        tableText.Append(item.InnerText);
    }

    // NOTE(review): as in the original, the collected text only lives in this local
    // and is discarded when the handler returns; store or forward it as needed.
    string contentHtml = tableText.ToString();
}
// NOTE(review): this brace block repeats the body of the WebBrowser-based GetHtmlContentFromWeb()
// shown above, without a method signature; it looks like a copy/paste artifact.
{
WebBrowser web = new WebBrowser();
web.Navigate("http://www.mayb.cn");
// NOTE(review): the completion handler is attached only after Navigate has been called.
web.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(web_DocumentCompleted);
}
/// <summary>
/// Handles <c>DocumentCompleted</c>: collects the inner text of every
/// "Table" element of the loaded document.
/// </summary>
/// <param name="sender">The <see cref="WebBrowser"/> that raised the event.</param>
/// <param name="e">Event data (not used).</param>
void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    WebBrowser web = (WebBrowser)sender;

    // Guard against a missing document instead of failing with a NullReferenceException.
    if (web.Document == null)
    {
        return;
    }

    // StringBuilder avoids re-copying the accumulated text for every table.
    System.Text.StringBuilder tableText = new System.Text.StringBuilder();
    foreach (HtmlElement item in web.Document.GetElementsByTagName("Table"))
    {
        tableText.Append(item.InnerText);
    }

    // NOTE(review): as in the original, the collected text only lives in this local
    // and is discarded when the handler returns; store or forward it as needed.
    string contentHtml = tableText.ToString();
}
3.HttpWebRequest/HttpWebResponse
HttpWebRequest 类对 WebRequest 中定义的属性和方法提供支持,也对使用户能够直接与使用 HTTP 的服务器交互的附加属性和方法提供支持。
不要使用 HttpWebRequest 构造函数。请使用 WebRequest.Create 方法初始化新的 HttpWebRequest 对象。如果统一资源标识符 (URI) 的方案是 http:// 或 https://,则 Create 返回 HttpWebRequest 对象。
若要使用 HttpWebRequest 进行客户端身份验证,客户端证书必须安装在当前用户的“我的证书”存储区中。
/// <summary>
/// Requests http://www.mayb.cn with <see cref="HttpWebRequest"/> and returns
/// the response body decoded as GB2312 text.
/// </summary>
/// <returns>The complete response body as a string.</returns>
public string GetHtmlContentFromWeb()
{
    HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create("http://www.mayb.cn");

    // The response, its stream and the reader are all disposable; leaving the
    // response open keeps the underlying connection busy, so release them
    // deterministically even when reading fails.
    using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
    using (Stream stream = webResponse.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312")))
    {
        return reader.ReadToEnd();
    }
}
// NOTE(review): this brace block repeats the body of the HttpWebRequest-based GetHtmlContentFromWeb()
// shown directly above, without a method signature; it looks like a copy/paste artifact.
{
string strBuff = "";
HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create("http://www.mayb.cn");
HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse();
Stream stream = webResponse.GetResponseStream();
// The response body is decoded as GB2312.
StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("GB2312"));
strBuff = reader.ReadToEnd();
// NOTE(review): webResponse, stream and reader are never closed or disposed here.
return strBuff;
}
/// <summary>
/// Requests <paramref name="sUrl"/> with <see cref="HttpWebRequest"/> and returns
/// the response body decoded as UTF-8 text.
/// </summary>
/// <param name="sUrl">The URL to request.</param>
/// <returns>
/// The response body, or an empty string if creating the request, sending it
/// or reading the response throws any exception.
/// </returns>
private string GetWebContent(string sUrl)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(sUrl);

        // HttpWebRequest.Timeout is expressed in milliseconds; value kept from the original.
        request.Timeout = 3000000;
        request.Headers.Set("Pragma", "no-cache");

        // The original guarded the read with response.ToString() != "", a check that
        // never fails for a response object, so the body is read unconditionally.
        // The using blocks release the response, stream and reader on every path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
        {
            return streamReader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best effort, as in the original: any failure yields an empty result.
        return "";
    }
}
// NOTE(review): this brace block repeats the body of GetWebContent(string sUrl) shown directly above,
// without a method signature; it looks like a copy/paste artifact.
{
string strResult = "";
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(sUrl);
// (refers to the line above) create the HttpWebRequest
request.Timeout = 3000000;
// (refers to the line above) set the request timeout
request.Headers.Set("Pragma", "no-cache");
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
// NOTE(review): compares the response's ToString() text with the empty string before reading.
if (response.ToString() != "")
{
Stream streamReceive = response.GetResponseStream();
Encoding encoding = Encoding.GetEncoding("UTF-8");
StreamReader streamReader = new StreamReader(streamReceive, encoding);
strResult = streamReader.ReadToEnd();
}
}
catch (Exception exp)
{
// Any exception is swallowed and an empty string is returned; exp is not used.
strResult = "";
}
return strResult;
}
/// <summary>
/// Requests <paramref name="url"/> with <see cref="HttpWebRequest"/> and returns
/// the response body decoded as UTF-8 text.
/// </summary>
/// <param name="url">The absolute URL to request.</param>
/// <returns>The complete response body as a string.</returns>
/// <remarks>
/// Exceptions raised while building the URI, sending the request or reading
/// the response propagate to the caller, as in the original.
/// </remarks>
public string GetWebContent3(string url)
{
    Uri httpURL = new Uri(url);
    HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(httpURL);

    // The original closed only the stream, and only on the success path.
    // The using blocks release the response, stream and reader on every path.
    using (HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse())
    using (Stream respStream = httpResp.GetResponseStream())
    using (StreamReader respStreamReader = new StreamReader(respStream, Encoding.UTF8))
    {
        // ReadToEnd replaces the manual 256-character read loop, which rebuilt
        // the whole result string on every iteration.
        return respStreamReader.ReadToEnd();
    }
}
// NOTE(review): this brace block repeats the body of GetWebContent3(string url) shown directly above,
// without a method signature; it looks like a copy/paste artifact.
{
string strBuff = "";
char[] cbuffer = new char[256];
int byteRead = 0;
Uri httpURL = new Uri(url);
// Build the HttpWebRequest
HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(httpURL);
// Get the HttpWebResponse
HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse();
try
{
// GetResponseStream() returns the data stream of the HTTP response, i.e. the content of the requested URL
Stream respStream = httpResp.GetResponseStream();
StreamReader respStreamReader = new StreamReader(respStream, Encoding.UTF8);
// Read the UTF-8 decoded text in chunks of up to 256 characters; the loop ends when Read returns 0
byteRead = respStreamReader.Read(cbuffer, 0, 256);
while (byteRead != 0)
{
// Append only the characters actually read in this chunk.
string strResp = new string(cbuffer, 0, byteRead);
strBuff = strBuff + strResp;
byteRead = respStreamReader.Read(cbuffer, 0, 256);
}
// NOTE(review): only the stream is closed, and only when no exception occurred.
respStream.Close();
}
catch (Exception)
{
// Rethrows unchanged; this catch adds no handling.
throw;
}
return strBuff;
}