抓取的网站,lol英雄官方资料站点:http://lol.qq.com/web201310/info-heros.shtml
因为网页中的数据是通过 Ajax 加载的,直接获取网页源代码取不到这些数据;一个解决办法是用 WebBrowser 控件加载网页,获取 JS 执行之后的 HTML。
在 WebBrowser 控件的 DocumentCompleted 事件中执行数据筛选。使用过程中遇到两个问题:
1. 必须把 WebBrowser 的尺寸(高度、宽度)设置得较大,否则 JS 加载不完全,抓取不到数据。
2. DocumentCompleted 事件会触发两次(可能是因为伪静态页面发生了跳转),需要先判断抓取到的内容是否有效,否则会报错并中断抓取。
抓取过程:首先进入主页,获得英雄列表以及各英雄详细数据页面的链接(大概有 140 多个英雄),然后再逐个抓取这 140 多个详情页的数据。
/// <summary>
/// Starts loading the hero list page in a new <see cref="WebBrowser"/> control so the page can be
/// scraped from the rendered document once <c>DocumentCompleted</c> fires.
/// </summary>
/// <param name="url">Absolute address of the hero list page.</param>
/// <param name="encode">Not read by this method; kept so existing callers keep compiling.</param>
internal void GetHtmlAfterJs(string url, string encode)
{
    WebBrowser myBrowser = new WebBrowser();
    // Attach the handler before navigation starts so the completion event cannot be missed.
    myBrowser.DocumentCompleted += MyBrowser_DocumentCompleted;
    myBrowser.Url = new Uri(url);
}

/// <summary>
/// Handles completion of the hero list page: collects one <c>HeroLink</c> per child of the
/// "jSearchHeroDiv" element, then starts one detail-page browser per collected link.
/// </summary>
/// <param name="sender">The <see cref="WebBrowser"/> that finished loading.</param>
/// <param name="e">Event data (not used).</param>
private void MyBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    var b = sender as WebBrowser;
    if (b == null || b.Document == null)
    {
        return;
    }

    var heroListContainer = b.Document.GetElementById("jSearchHeroDiv");
    if (heroListContainer == null)
    {
        // The list is not in the document (yet). DocumentCompleted can fire more than once,
        // so keep the browser alive and wait for a later firing instead of throwing.
        return;
    }

    List<HeroLink> _heroLinks = new List<HeroLink>();
    foreach (HtmlElement li_item in heroListContainer.Children)
    {
        if (li_item.Children.Count == 0)
        {
            // No anchor inside this entry; nothing to read.
            continue;
        }

        var lnk = li_item.Children[0];
        HeroLink heroLink = new HeroLink();
        heroLink.DisplayName = lnk.InnerText;
        heroLink.LinkUrl = lnk.GetAttribute("href");

        // The portrait is the first IMG child of the anchor, if there is one.
        foreach (HtmlElement ele in lnk.Children)
        {
            if (ele.TagName == "IMG")
            {
                heroLink.ImgSrc = ele.GetAttribute("src");
                break;
            }
        }

        hero_count += 1;
        Debug.WriteLine(String.Format("列表中获取的第{0}个英雄链接,英雄名字:{1},链接地址:{2}", hero_count, heroLink.DisplayName ,heroLink.LinkUrl));
        _heroLinks.Add(heroLink);
    }

    if (_heroLinks.Count == 0)
    {
        // Container present but still empty: do not dispose the browser, a later
        // DocumentCompleted firing may carry the populated list.
        return;
    }

    foreach (HeroLink heroLink in _heroLinks)
    {
        if (string.IsNullOrEmpty(heroLink.LinkUrl))
        {
            // NOTE(review): the original code returns here, which abandons the remaining
            // links and leaves the list browser undisposed. Kept as-is because it may be
            // the intended guard against a half-rendered list; confirm whether skipping
            // only this entry would be more appropriate.
            return;
        }

        WebBrowser myBrowser = new WebBrowser();
        // A tall control is used so the detail page's scripts load all of their data.
        myBrowser.Height = 2000;
        // The detail handler reads the link back from Tag.
        myBrowser.Tag = heroLink;
        UserAgentHelper.AppendUserAgent("");
        myBrowser.ScriptErrorsSuppressed = true;
        // Attach the handler before navigation starts.
        myBrowser.DocumentCompleted += MyBrowser_DocumentCompleted_Detail;
        myBrowser.Url = new Uri(heroLink.LinkUrl);
    }

    // The list has been processed: detach first so a further DocumentCompleted firing
    // cannot start a second round of detail browsers, then release the control.
    b.DocumentCompleted -= MyBrowser_DocumentCompleted;
    b.Dispose();
}
进入详情页后再次抓取:
/// <summary>
/// Handles completion of one hero detail page: reads name, title, tags, ability bars, skins,
/// story, skill icons and tips from the rendered document into a <c>HeroDetail</c>, attaches it
/// to the <c>HeroLink</c> stored in the browser's <c>Tag</c>, and writes the XML of all collected
/// links to <c>textBox1</c> once <c>HeroCount</c> reaches <c>hero_count - 1</c>.
/// </summary>
/// <param name="sender">The <see cref="WebBrowser"/> that finished loading.</param>
/// <param name="e">Event data; its <c>Url</c> is compared with the browser's current URL for logging.</param>
private void MyBrowser_DocumentCompleted_Detail(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    var b = sender as WebBrowser;
    if (b == null || b.Document == null)
    {
        return;
    }

    var heroLink = (HeroLink)b.Tag;
    HtmlDocument doc = b.Document;

    // Diagnostic only: both branches continue with the scrape.
    if (!e.Url.Equals(b.Url))
    {
        Debug.WriteLine(String.Format("网页进行了重定向,是想要的"));
    }
    else
    {
        Debug.WriteLine(String.Format("网页没有重定向,可能不是想要的"));
    }

    var heroDetail = new HeroDetail();
    heroDetail.HeroName = GetInnerTextOrNull(doc, "DATAname");
    heroDetail.PersonName = GetInnerTextOrNull(doc, "DATATitle");

    // A usable page has a hero name containing at least one CJK character. A missing or empty
    // name element is treated the same way instead of throwing. The browser is deliberately
    // not disposed here so that a later DocumentCompleted firing can still be processed.
    if (heroDetail.HeroName == null || !Regex.IsMatch(heroDetail.HeroName, @"[\u4e00-\u9fa5]"))
    {
        Debug.WriteLine(String.Format("抓取到了不合格的网页"));
        return;
    }

    HeroCount += 1;
    Debug.WriteLine(String.Format("这是抓取成功的第{0}个英雄", HeroCount));

    // Hero type tags.
    var tagContainer = doc.GetElementById("DATAtags");
    if (tagContainer != null)
    {
        foreach (HtmlElement span in tagContainer.Children)
        {
            heroDetail.Types.Add(span.InnerText);
        }
    }

    // Ability bars: one DD element per ability.
    string[] data_info_header = new string[] { "物理攻击","魔法攻击","防御能力", "上手难度" };
    var abilityContainer = doc.GetElementById("DATAinfo");
    if (abilityContainer != null)
    {
        foreach (HtmlElement dd in abilityContainer.Children)
        {
            if (dd.TagName != "DD")
            {
                continue;
            }

            string widthText;
            Ability ability;
            if (!TryParseAbility(dd.InnerHtml, data_info_header, out widthText, out ability))
            {
                // Markup does not have the expected shape; skip this bar rather than throw.
                continue;
            }

            Debug.WriteLine(heroDetail.HeroName+widthText+b.Url+heroDetail.PersonName);
            heroDetail.Abilitys.Add(ability);
        }
    }

    // Skins: each entry's first child carries the title, and that child's first child the thumbnail.
    var skinNAV = doc.GetElementById("skinNAV");
    if (skinNAV != null)
    {
        foreach (HtmlElement li in skinNAV.Children)
        {
            if (li.Children.Count == 0 || li.Children[0].Children.Count == 0)
            {
                continue;
            }

            HtmlElement skinAnchor = li.Children[0];
            Skin skin = new Skin();
            skin.SkinName = skinAnchor.GetAttribute("title");
            // The thumbnail URL contains "small"; it is rewritten to "big".
            skin.SkinImgUrl = skinAnchor.Children[0].GetAttribute("src").Replace("small","big");
            heroDetail.Skins.Add(skin);
            Debug.WriteLine(String.Format("皮肤{0}的图片Url:{1}", skin.SkinName, skin.SkinImgUrl));
        }
    }

    // Background story.
    heroDetail.Story = GetInnerTextOrNull(doc, "DATAlore");
    Debug.WriteLine(heroDetail.Story+"???");

    // Skill icons.
    var skill_Ul = doc.GetElementById("DATAspellsNAV");
    if (skill_Ul != null)
    {
        foreach (HtmlElement li in skill_Ul.Children)
        {
            if (li.Children.Count == 0)
            {
                continue;
            }

            Skill skill = new Skill();
            skill.SkillImageUrl = li.Children[0].GetAttribute("src");
            heroDetail.Skills.Add(skill);
        }
    }

    // Usage tips (playing as / playing against).
    heroDetail.TipIfUse = GetInnerTextOrNull(doc, "DATAallytips");
    heroDetail.TipIfRival = GetInnerTextOrNull(doc, "DATAenemytips");
    if (tagContainer != null)
    {
        Debug.WriteLine(tagContainer.InnerHtml);
    }

    heroLink.Detail = heroDetail;
    heroLinks.Add(heroLink);

    if (HeroCount>=hero_count-1)
    {
        string OutputXmlString;
        using (MemoryStream ms = new MemoryStream())
        {
            var setting = new XmlWriterSettings()
            {
                // UTF-8 without a byte-order mark.
                Encoding = new UTF8Encoding(false),
                Indent = true,
            };
            using (XmlWriter writer = XmlWriter.Create(ms, setting))
            {
                XmlSerializer xmlSearializer = new XmlSerializer(typeof(List<HeroLink>));
                xmlSearializer.Serialize(writer, heroLinks);
            }
            // Read the buffer only after the writer has been closed, so everything it
            // buffered has reached the stream.
            OutputXmlString = Encoding.UTF8.GetString(ms.ToArray());
        }
        textBox1.Text = OutputXmlString;
    }

    // Detach before disposing so a further DocumentCompleted firing for this browser
    // cannot record the same hero twice.
    b.DocumentCompleted -= MyBrowser_DocumentCompleted_Detail;
    b.Dispose();
}

/// <summary>
/// Returns the <c>InnerText</c> of the element with the given id, or null when the document
/// has no such element.
/// </summary>
private static string GetInnerTextOrNull(HtmlDocument doc, string id)
{
    HtmlElement element = doc.GetElementById(id);
    return element == null ? null : element.InnerText;
}

/// <summary>
/// Parses one ability bar. The single character after the first "up up" is a 1-based index into
/// <paramref name="headers"/>; the text between the first ':' and the first '%' is the value.
/// </summary>
/// <param name="ddHtml">Inner HTML of the DD element.</param>
/// <param name="headers">Ability names, selected by the parsed index.</param>
/// <param name="widthText">The raw value text on success; otherwise null.</param>
/// <param name="ability">The parsed ability on success; otherwise the default value.</param>
/// <returns>True when the index and the value were both parsed and the index is in range.</returns>
private static bool TryParseAbility(string ddHtml, string[] headers, out string widthText, out Ability ability)
{
    widthText = null;
    ability = default(Ability);
    if (string.IsNullOrEmpty(ddHtml))
    {
        return false;
    }

    int markerPos = ddHtml.IndexOf("up up", StringComparison.Ordinal);
    int digitPos = markerPos + 5;
    if (markerPos < 0 || digitPos >= ddHtml.Length)
    {
        return false;
    }

    int headerIndex;
    if (!int.TryParse(ddHtml.Substring(digitPos, 1), System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture, out headerIndex))
    {
        return false;
    }
    headerIndex -= 1;
    if (headerIndex < 0 || headerIndex >= headers.Length)
    {
        return false;
    }

    int colonPos = ddHtml.IndexOf(":", StringComparison.Ordinal);
    int percentPos = ddHtml.IndexOf("%", StringComparison.Ordinal);
    if (colonPos < 0 || percentPos <= colonPos)
    {
        return false;
    }

    string rawValue = ddHtml.Substring(colonPos + 1, percentPos - colonPos - 1);
    double value;
    // Markup always uses '.' as the decimal separator, so parse with the invariant culture
    // rather than the machine's current culture.
    if (!double.TryParse(rawValue,
            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
            System.Globalization.CultureInfo.InvariantCulture, out value))
    {
        return false;
    }

    widthText = rawValue;
    ability = new Ability();
    ability.AbilityName = headers[headerIndex];
    ability.AbilityValue = value;
    return true;
}
最后将我建立的对象序列化为 XML 得到结果。由于数据较多,结果已上传到 GitHub:
https://github.com/songshizhao/Catcher/blob/master/README.md