我已经尝试过多种方式来判断网页编码,但效果都不理想。在不知道网页编码方式的情况下,怎样才能正确下载网页内容而不出现乱码?请大家指教。
#region【获取网页HTML文本】
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded to text.
/// </summary>
/// <remarks>
/// The response body is read into memory exactly once and decoded from that buffer.
/// (A network response stream cannot be rewound, so decoding it a second time from
/// the stream itself yields an empty string.)
/// The encoding is chosen in this order:
/// 1. a byte-order mark at the start of the body, if present;
/// 2. the charset named in the HTTP Content-Type header;
/// 3. the charset named inside the document (e.g. a meta tag);
/// 4. <paramref name="codeType"/>, when it names a known encoding;
/// 5. <see cref="Encoding.Default"/>.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <param name="codeType">
/// Name of the encoding to fall back to (for example "gb2312") when the page does not
/// declare one. May be null or empty.
/// </param>
/// <returns>
/// The decoded page text; null when the server answers with a status other than 200 OK;
/// or the exception message when the request fails (kept for compatibility with existing
/// callers, which means a failure cannot be told apart from page content by type alone).
/// </returns>
public static String GetResponseText(string url, string codeType)
{
    // Only the head of the document is inspected for a charset declaration.
    const int sniffLength = 4096;

    try
    {
        WebRequest request = WebRequest.Create(url);
        request.Credentials = CredentialCache.DefaultCredentials;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // Compare the numeric status; the reason phrase text is server-dependent.
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return null;
            }

            byte[] body;
            using (Stream dataStream = response.GetResponseStream())
            {
                body = ReadAllResponseBytes(dataStream);
            }

            Encoding encoding = TryGetEncodingByName(ExtractCharsetName(response.ContentType));
            if (encoding == null)
            {
                // Charset names are plain ASCII, so an ASCII view of the raw bytes is
                // enough to find the declaration whatever the real encoding is.
                string head = Encoding.ASCII.GetString(body, 0, Math.Min(body.Length, sniffLength));
                encoding = TryGetEncodingByName(ExtractCharsetName(head));
            }
            if (encoding == null)
            {
                encoding = TryGetEncodingByName(codeType);
            }
            if (encoding == null)
            {
                encoding = Encoding.Default;
            }

            // 'true' lets a byte-order mark override the chosen encoding and strips it.
            using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding, true))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        return ex.Message;
    }
}

/// <summary>Reads <paramref name="source"/> to its end and returns everything read.</summary>
private static byte[] ReadAllResponseBytes(Stream source)
{
    using (MemoryStream buffer = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        return buffer.ToArray();
    }
}

/// <summary>
/// Returns the value of the first "charset=..." declaration in <paramref name="text"/>
/// (quoted or unquoted, any letter case), or null when there is none.
/// </summary>
private static string ExtractCharsetName(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    Match match = Regex.Match(text, @"charset\s*=\s*[""']?\s*([A-Za-z0-9_\-:.]+)", RegexOptions.IgnoreCase);
    return match.Success ? match.Groups[1].Value : null;
}

/// <summary>
/// Returns the encoding registered under <paramref name="name"/>, or null when the name
/// is blank or not recognised.
/// </summary>
private static Encoding TryGetEncodingByName(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    string trimmed = name.Trim();
    if (trimmed.Length == 0)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(trimmed);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: let the caller try the next source.
        return null;
    }
}
#endregion
2 回答
- 2 回答
- 0 关注
- 399 浏览
添加回答
举报
0/150
提交
取消