功能介绍:
这是一款基于 ASP.NET 的网络数据抓取程序,界面功能较为完善。
根据你需要的关键字,在搜索网站或贴吧上抓取相关信息,搜索功能可以扩展!
页面搜索功能设置:搜索网站选择框,采集信息的显示条数及关键字输入框;
显示列表:编号,信息来源,标题,抓取内容,点击率,抓取时间等功能!

// Source names compared against ddlNewsSource.SelectedValue; the same value is
// passed to AddData as the source of each stored article.
private const string SourceSina = "新浪";
private const string SourceBaiduTieba = "百度贴吧";
private const string SourceSogou = "搜狗";

/// <summary>
/// Click handler of the crawl button. Validates the selected source, keyword and
/// item count, crawls the chosen site's search result pages until the requested
/// number of new articles has been stored (or the site stops returning results),
/// then rebinds the "top 10" and full data repeaters.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void btnCrawl_Click(object sender, EventArgs e)
{
    btnCrawl.Enabled = false;
    string news = PageOperate.GetNullToString(ddlNewsSource.SelectedValue);
    string words = PageOperate.GetNullToString(txtWords.Text.Trim());
    int nums = PageOperate.GetIntValue(ddlNums.SelectedValue);
    if (news == "")
    {
        PageOperate.AlertAndRedirect("请选择新闻来源!", "Default.aspx");
        return;
    }
    if (words == "")
    {
        PageOperate.AlertAndRedirect("请填写关键字!", "Default.aspx");
        return;
    }
    if (nums == 0)
    {
        PageOperate.AlertAndRedirect("请选择抓取的数量!", "Default.aspx");
        return;
    }

    lblShow.Text = "正在抓取.....";

    // A null pattern means the selected source is not one this handler knows how
    // to crawl; in that case only the lists are refreshed.
    string listPattern = GetListPattern(news);
    if (listPattern != null)
    {
        try
        {
            // URL-encode the keyword as gb2312 so it is not garbled in the query string.
            words = HttpUtility.UrlEncode(words, System.Text.Encoding.GetEncoding("gb2312"));
            CrawlSource(news, words, nums, listPattern);
        }
        catch (Exception ex)
        {
            // Report the failure for every source instead of silently swallowing it.
            // Execution continues below so articles stored before the failure are
            // shown and the button does not stay disabled.
            System.Diagnostics.Trace.TraceError("btnCrawl_Click failed for source '" + news + "': " + ex);
            lblShow.Text = "抓取数据出现异常!";
            lblShow.ForeColor = System.Drawing.Color.Red;
        }
    }

    // Latest 10 entries.
    BindData(repTop10, 10);
    // Full data list.
    BindData(repData, 1000);
    btnCrawl.Enabled = true;
}

/// <summary>
/// Walks the result pages of one source and stores every article whose title is
/// not yet in the Article table, until <paramref name="nums"/> articles were added,
/// a list page cannot be downloaded, or a list page contains no items.
/// </summary>
/// <param name="news">Source name (one of the Source* constants).</param>
/// <param name="encodedWords">URL-encoded search keyword.</param>
/// <param name="nums">Number of new articles to collect.</param>
/// <param name="listPattern">Regex pattern that matches one entry of the result list.</param>
private void CrawlSource(string news, string encodedWords, int nums, string listPattern)
{
    int crawlNum = 0;
    for (int page = 1; ; page++)
    {
        string html = GetHtml(BuildListUrl(news, encodedWords, page));
        if (html == "error")
        {
            return;
        }

        string matHtml = ResolverAndOutput(html, "", "", listPattern, 1, false);
        string[] itemArray = matHtml.Replace("~", "").Split('$');

        // The last array element is never treated as an item (presumably the
        // remainder after the final '$' separator - confirm against ResolverAndOutput).
        // A page without items means the results are exhausted; without this stop
        // the loop would keep requesting further pages forever.
        if (itemArray.Length <= 1)
        {
            return;
        }

        for (int j = 0; j < itemArray.Length - 1; j++)
        {
            lblShow.Text = "分析到第" + page + " 页,第" + (j + 1) + "条数据,已采集" + crawlNum + "条数据!";

            // Stop as soon as the requested number of articles has been collected.
            if (crawlNum == nums)
            {
                return;
            }

            string title = ExtractTitle(news, itemArray[j]);
            if (TitleExists(title))
            {
                continue;
            }

            string conUrl = ExtractArticleUrl(news, itemArray[j]);
            if (conUrl.LastIndexOf('/') < 0)
            {
                continue;
            }

            string conPage = GetHtml(conUrl);
            if (conPage == "error")
            {
                continue;
            }

            string conHtml = ExtractArticleHtml(news, conPage);
            if (conHtml == null)
            {
                continue;
            }

            if (AddData(news, title, Server.HtmlEncode(conHtml), "Corp") > 0)
            {
                crawlNum++;
            }
        }
    }
}

/// <summary>Returns the regex pattern matching one result-list entry, or null for an unknown source.</summary>
private static string GetListPattern(string news)
{
    switch (news)
    {
        case SourceSina:
            return "<h2><a href=\"http://(?<content>.+?)</span></h2>";
        case SourceBaiduTieba:
            return "<div class=\"threadlist_text threadlist_title(?<content>.+?)</a><span ></span></div>";
        case SourceSogou:
            return "<h3 class=\"pt\">(?<content>.+?)</h3>";
        default:
            return null;
    }
}

/// <summary>Builds the URL of the 1-based result page <paramref name="page"/> for the given source.</summary>
private static string BuildListUrl(string news, string encodedWords, int page)
{
    switch (news)
    {
        case SourceSina:
            return "http://search.sina.com.cn/?q=" + encodedWords + "&range=title&c=news&sort=time&col=&source=&from=&country=&size=&time=&a=&page=" + page + "&pf=2131425478&ps=2134309112&dpc=1";
        case SourceBaiduTieba:
            // Tieba pages by offset: pn starts at 0 and grows by 50 per page.
            return "http://tieba.baidu.com/f?kw=" + encodedWords + "&pn=" + ((page - 1) * 50);
        case SourceSogou:
            return "http://news.sogou.com/news?query=" + encodedWords + "&sut=2543&sst0=1396574960819&mode=2&x=30&y=9&page=" + page + "&w=01029901&dr=1";
        default:
            return null;
    }
}

/// <summary>Extracts the entry title from a list item and strips its HTML tags.</summary>
private string ExtractTitle(string news, string item)
{
    string startMarker = news == SourceBaiduTieba ? "class=\"j_th_tit\">" : "target=\"_blank\">";
    return PageOperate.CutHTML(CutString(item, startMarker, "</a>")).Trim();
}

/// <summary>Extracts the link to the article/thread page from a list item.</summary>
private string ExtractArticleUrl(string news, string item)
{
    switch (news)
    {
        case SourceSina:
            return CutString(item, "<h2><a href=\"", "\" target=\"_blank\"");
        case SourceBaiduTieba:
            // The href cut from the item is prefixed with the site root.
            return "http://tieba.baidu.com" + CutString(item, "<a href=\"", "\" title=\"");
        case SourceSogou:
            return CutString(item, "<a class=\"pp\" href=\"", "\" id=\"uigs_");
        default:
            return "";
    }
}

/// <summary>
/// Checks whether an article with this title is already stored.
/// Single quotes are doubled because the title comes from scraped pages and is
/// embedded in a textual filter.
/// </summary>
private static bool TitleExists(string title)
{
    string filter = "Title = '" + title.Replace("'", "''") + "'";
    DataTable dt = BLL.Pager.GetPager("Id,Title", "Article", filter);
    return dt.Rows.Count > 0;
}

/// <summary>
/// Extracts the article body from a downloaded content page using the page
/// layouts known for the given source.
/// </summary>
/// <returns>The raw (not yet HTML-encoded) body, or null when no known layout matches.</returns>
private string ExtractArticleHtml(string news, string conPage)
{
    switch (news)
    {
        case SourceSina:
        {
            if (HasMarker(conPage, "<div id=\"divContent\""))
            {
                return CutContent(conPage, "<div id=\"divContent\"", "<div id=\"divAttachment\">");
            }
            return CutAtFirstPresentEnd(
                conPage,
                "<div class=\"blkContainerSblkCon BSHARE_POP\"",
                new string[]
                {
                    "<div class=\"se_edit\"",
                    "<div class=\"wb_rec\" id=\"wb_rec\" style",
                    "<iframe width=\"100%\" scrolling=\"no\" height=\"35\""
                });
        }
        case SourceBaiduTieba:
        {
            if (!HasMarker(conPage, "<cc><div id=\"post_content_"))
            {
                return null;
            }
            string posts = ResolverAndOutput(conPage, "", "", "<cc><div id=\"post_content_(?<content>.+?)</div></cc>", 1, false);
            // Only the first matched post is kept.
            return posts.Replace("~", "").Split('$')[0];
        }
        case SourceSogou:
        {
            // Sohu news layout.
            if (HasMarker(conPage, "<div class=\"text clear\" id=\"contentText\""))
            {
                return CutAtFirstPresentEnd(
                    conPage,
                    "<div class=\"text clear\" id=\"contentText\"",
                    new string[]
                    {
                        "<div class=\"autoShare clear\">",
                        "<div class=\"original-title\">"
                    });
            }
            // Tencent news layout.
            if (HasMarker(conPage, "<div id=\"Cnt-Main-Article-QQ\""))
            {
                return CutContent(conPage, "<div id=\"Cnt-Main-Article-QQ\"", "<span style=\"width:0;height:0;");
            }
            // NetEase news layout.
            if (HasMarker(conPage, "<div id=\"endText\">"))
            {
                return CutContent(conPage, "<div id=\"endText\">", "<div class=\"sharecommend-wrap clearfix\">");
            }
            return null;
        }
        default:
            return null;
    }
}

/// <summary>True when <paramref name="marker"/> occurs anywhere in <paramref name="page"/> (ordinal comparison).</summary>
private static bool HasMarker(string page, string marker)
{
    return page.IndexOf(marker, StringComparison.Ordinal) >= 0;
}

/// <summary>
/// Cuts the body that starts at <paramref name="startMarker"/> using the first of
/// <paramref name="endMarkers"/> that occurs in the page. Returns null when the start
/// marker or all end markers are absent. Later markers are not tried once one is present.
/// </summary>
private string CutAtFirstPresentEnd(string page, string startMarker, string[] endMarkers)
{
    if (!HasMarker(page, startMarker))
    {
        return null;
    }
    foreach (string endMarker in endMarkers)
    {
        if (HasMarker(page, endMarker))
        {
            return CutContent(page, startMarker, endMarker);
        }
    }
    return null;
}

/// <summary>
/// Matches <c>startMarker(?&lt;content&gt;.+?)endMarker</c> through ResolverAndOutput and
/// returns the matched text up to (not including) the end marker, or null when the
/// end marker does not occur in the match result.
/// </summary>
private string CutContent(string page, string startMarker, string endMarker)
{
    string matched = ResolverAndOutput(page, "", "", startMarker + "(?<content>.+?)" + endMarker, 1, false);
    int endIndex = matched.IndexOf(endMarker, StringComparison.Ordinal);
    if (endIndex < 0)
    {
        return null;
    }
    return matched.Substring(0, endIndex);
}
评论