How to Use the HTML Agility Pack: Parsing HTML Documents to Extract and Replace Node Content

Download From: http://htmlagilitypack.codeplex.com/

In your application, add a reference to HtmlAgilityPack.dll in the HTMLAgilityPack\Debug (or Release)\bin folder.

Then, as an example:

HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();

// There are various options, set as needed
htmlDoc.OptionFixNestedTags = true;

// filePath is a path to a file containing the html
htmlDoc.Load(filePath);

// Use htmlDoc.LoadHtml(xmlString); to load from a string instead
// (this was previously htmlDoc.LoadXML(xmlString))

// ParseErrors is a collection of any errors from the Load statement
if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Count() > 0)
{
    // Handle any parse errors as required
}
else
{
    if (htmlDoc.DocumentNode != null)
    {
        HtmlAgilityPack.HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//body");

        if (bodyNode != null)
        {
            // Do something with bodyNode
        }
    }
}
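
If the page lives on the web rather than in a local file, the library's HtmlWeb helper can fetch and parse it in one step. A minimal sketch (the URL here is only a placeholder):

HtmlAgilityPack.HtmlWeb web = new HtmlAgilityPack.HtmlWeb();
HtmlAgilityPack.HtmlDocument webDoc = web.Load("http://example.com/");

HtmlAgilityPack.HtmlNode body = webDoc.DocumentNode.SelectSingleNode("//body");
if (body != null)
{
    // For example, dump the text content of the body
    Console.WriteLine(body.InnerText);
}
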
Extract only the content from the web page (not the HTML)


// use the html agility pack: http://www.codeplex.com/htmlagilitypack
// ExceptHeadTag is a string holding the HTML to parse
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(ExceptHeadTag);

//StringBuilder sb = new StringBuilder();
//foreach (HtmlTextNode node in doc.DocumentNode.SelectNodes("//text()"))
//{
//    sb.AppendLine(node.Text);
//}
//string final = sb.ToString();

var root = doc.DocumentNode;
var sb = new StringBuilder();
//foreach (var node in root.DescendantNodesAndSelf())
foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//text()"))
{
    if (!node.HasChildNodes)
    {
        string text = node.InnerText;
        if (!string.IsNullOrEmpty(text))
            sb.AppendLine(text.Trim());
    }
}

string final = sb.ToString();
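
Note that the "//text()" XPath above also returns text that sits inside script, style and head elements. If you only want the visible page text, the same ancestor filters used later in this post can be applied here too; a sketch of that variation:

var visibleTextNodes = doc.DocumentNode.SelectNodes(
    "//text()[not(ancestor::script) and not(ancestor::style) and not(ancestor::head)]");

var sbVisible = new StringBuilder();
if (visibleTextNodes != null)
{
    foreach (HtmlNode node in visibleTextNodes)
    {
        string text = node.InnerText.Trim();
        if (!string.IsNullOrEmpty(text))
            sbVisible.AppendLine(text);
    }
}
string visibleText = sbVisible.ToString();
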

Replace Only Node Content

// Reference: http://social.msdn.microsoft.com/Forums/en-US/regexp/thread/beae72d6-844f-4a9b-ad56-82869d685037/
// dt is a DataTable holding MainWord/BanglaWord pairs (built in the full code below)
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(Html);

if (dt.Rows.Count > 0)
{
    for (Int32 n = 0; n < dt.Rows.Count; n++)
    {
        String MainW = dt.Rows[n]["MainWord"].ToString().Trim();
        String BanW = dt.Rows[n]["BanglaWord"].ToString().Trim();

        string boundedKeyword = @"\b" + Regex.Escape(MainW) + @"\b";

        if (BanW != "")
        {
            //var nodes = doc.DocumentNode.SelectNodes("//text()[not(ancestor::a) and not(ancestor::h1) and not(ancestor::h2) and not(ancestor::head)]") ?? new HtmlNodeCollection(null);
            var nodes = doc.DocumentNode.SelectNodes("//text()[not(ancestor::h1) and not(ancestor::h2) and not(ancestor::head)]") ?? new HtmlNodeCollection(null);
            foreach (var node in nodes)
            {
                node.InnerHtml = Regex.Replace(node.InnerHtml, boundedKeyword, (MainW + "( " + BanW + " )"), RegexOptions.IgnoreCase);
            }
            //Html = doc.DocumentNode.OuterHtml;
            //doc.LoadHtml(Html);
        }
    }
}
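
If you want the node-replacement idea without the DataTable plumbing from the full code below, here is a minimal, self-contained sketch of the same technique; the word list is made up purely for illustration:

// A tiny hard-coded word list, only for illustration
var translations = new Dictionary<string, string>
{
    { "book", "boi" },
    { "water", "pani" }
};

HtmlAgilityPack.HtmlDocument sketchDoc = new HtmlAgilityPack.HtmlDocument();
sketchDoc.LoadHtml("<html><body><p>A book and some water.</p></body></html>");

foreach (var pair in translations)
{
    string pattern = @"\b" + Regex.Escape(pair.Key) + @"\b";
    var textNodes = sketchDoc.DocumentNode.SelectNodes("//text()[not(ancestor::head)]")
        ?? new HtmlNodeCollection(null);

    foreach (var node in textNodes)
    {
        // Replace the word only inside text nodes so tags and attributes stay untouched
        node.InnerHtml = Regex.Replace(node.InnerHtml, pattern,
            pair.Key + " (" + pair.Value + ")", RegexOptions.IgnoreCase);
    }
}

string updatedHtml = sketchDoc.DocumentNode.OuterHtml;
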

Full Code

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using System.Data.SqlClient;
using System.Data;

namespace bBrowser
{
    public partial class _Default : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        protected void btnBrowse_Click(object sender, EventArgs e)
        {
            string url = "http://" + TextBoxURL.Text;

            WebRequest webReq = WebRequest.Create(url);
            WebResponse webRes = webReq.GetResponse();

            using (StreamReader reader = new StreamReader(webRes.GetResponseStream()))
            {
            }

            WebClient client = new WebClient();

            byte[] byteData = null;
            byteData = client.DownloadData(url);

            UTF8Encoding UTF8Encod = new UTF8Encoding();

            String str2 = scrapeIt(url);

            //divContent.InnerText = str2;
            CreateVectorAndConvertToBangla(UTF8Encod.GetString(byteData), str2);
        }

        public void tmpHTMLPageCache(String html)
        {
            Random rand = new Random((int)DateTime.Now.Ticks);
            int RandomNumber = rand.Next(0, 100000);

            DateTime cdt = DateTime.Now;

            String HtmlFileName = Convert.ToString(cdt.Year) + Convert.ToString(cdt.Month) + Convert.ToString(cdt.Day) +
                Convert.ToString(cdt.Hour) + Convert.ToString(cdt.Minute) + Convert.ToString(cdt.Second) +
                Convert.ToString(cdt.Millisecond) + RandomNumber.ToString();

            String Location = AppDomain.CurrentDomain.BaseDirectory;
            String FileName = Location + "//Cache//" + HtmlFileName + ".html";

            // Create a writer and open the file:
            StreamWriter StrmHtml;
            if (!File.Exists(FileName))
            {
                StrmHtml = new StreamWriter(FileName);
                //StrmHtml.WriteLine(html);
            }
            else
            {
                StrmHtml = File.AppendText(FileName);
            }

            // Write to the file:
            StrmHtml.WriteLine(html);
            StrmHtml.WriteLine();

            // Close the stream:
            StrmHtml.Close();

            this.Iframe.Attributes["src"] = "Cache/" + HtmlFileName + ".html";
        }

        public void CreateVectorAndConvertToBangla(String Html, String myString)
        {
            List<string> myList = myString.Split(' ').ToList();
            Int32 Count = myList.Count;

            // Get distinct elements and convert into a list again.
            myList = myList.Distinct().ToList();
            Count = myList.Count;

            DataTable dt = new DataTable();
            dt.Columns.Add("MainWord", typeof(string));
            dt.Columns.Add("BanglaWord", typeof(string));

            for (Int32 i = 0; i < Count; i++)
            {
                String enMainWord = myList[i].ToString().Trim();
                String banWord = BindDataList(enMainWord).ToString();
                if (banWord != "")
                {
                    DataRow dRow = dt.NewRow();
                    dRow["MainWord"] = enMainWord.ToString();
                    dRow["BanglaWord"] = banWord.ToString();
                    dt.Rows.Add(dRow);
                }
            }

            //if (dt.Rows.Count > 0)
            //{
            //    for (Int32 n = 0; n < dt.Rows.Count; n++)
            //    {
            //        String MainW = dt.Rows[n]["MainWord"].ToString().Trim();
            //        String BanW = dt.Rows[n]["BanglaWord"].ToString().Trim();

            //        if (BanW != "")
            //        {
            //            Html = ReplaceWholeWord(Html, MainW, (MainW + " (" + BanW + ")"));
            //        }
            //    }
            //}

            // Reference: http://social.msdn.microsoft.com/Forums/en-US/regexp/thread/beae72d6-844f-4a9b-ad56-82869d685037/
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(Html);

            if (dt.Rows.Count > 0)
            {
                for (Int32 n = 0; n < dt.Rows.Count; n++)
                {
                    String MainW = dt.Rows[n]["MainWord"].ToString().Trim();
                    String BanW = dt.Rows[n]["BanglaWord"].ToString().Trim();

                    string boundedKeyword = @"\b" + Regex.Escape(MainW) + @"\b";

                    if (BanW != "")
                    {
                        //var nodes = doc.DocumentNode.SelectNodes("//text()[not(ancestor::a) and not(ancestor::h1) and not(ancestor::h2) and not(ancestor::head)]") ?? new HtmlNodeCollection(null);
                        var nodes = doc.DocumentNode.SelectNodes("//text()[not(ancestor::h1) and not(ancestor::h2) and not(ancestor::head)]") ?? new HtmlNodeCollection(null);
                        foreach (var node in nodes)
                        {
                            node.InnerHtml = Regex.Replace(node.InnerHtml, boundedKeyword, (MainW + "( " + BanW + " )"), RegexOptions.IgnoreCase);
                        }
                        //Html = doc.DocumentNode.OuterHtml;
                        //doc.LoadHtml(Html);
                    }
                }
            }

            //tmpHTMLPageCache(Html);
            tmpHTMLPageCache(doc.DocumentNode.OuterHtml);
        }

        // Replace full match word
        public string ReplaceWholeWord(string original, string wordToFind, string replacement)
        {
            string pattern = String.Format(@"\b{0}\b", wordToFind);
            string ret = Regex.Replace(original, pattern, replacement, RegexOptions.IgnoreCase);
            return ret;
        }

        // Return data with respect to query
        public string BindDataList(String MainWord)
        {
            String ConStr = "Data Source=172.16.137.22;Initial Catalog=dbBanglaBrowser;Persist Security Info=True;User ID=sa;Password=KueT!@#";
            SqlConnection con = new SqlConnection(ConStr);
            con.Open();
            SqlCommand command = new SqlCommand("SELECT isNull(BengaliMeaning, '') as BengaliMeaning FROM T_Dictionary WHERE MainWord='" + MainWord + "'", con);
            SqlDataAdapter da = new SqlDataAdapter(command);
            DataSet ds = new DataSet();
            DataTable dt = new DataTable();
            da.Fill(ds);
            dt = ds.Tables[0];
            con.Close();

            String rtnStr = "";
            try
            {
                rtnStr = dt.Rows[0]["BengaliMeaning"].ToString();
            }
            catch (Exception ex) { rtnStr = ""; }
            return rtnStr;
        }

        public static string scrapeIt(string siteToScrape)
        {
            string HTML = getHTML(siteToScrape);
            string text = stripCode(HTML);
            return text;
        }

        public static string getHTML(string siteToScrape)
        {
            string response = "";
            HttpWebResponse objResponse;
            HttpWebRequest objRequest = (HttpWebRequest)WebRequest.Create(siteToScrape);
            objRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)";
            objResponse = (HttpWebResponse)objRequest.GetResponse();
            using (StreamReader sr = new StreamReader(objResponse.GetResponseStream()))
            {
                response = sr.ReadToEnd();
                sr.Close();
            }
            return response;
        }

        public static string stripCode(string the_html)
        {
            // Remove google analytics code and other JS
            the_html = Regex.Replace(the_html, "<script.*?</script>", " ",
                RegexOptions.Singleline | RegexOptions.IgnoreCase);
            // Remove inline stylesheets
            the_html = Regex.Replace(the_html, "<style.*?</style>", " ",
                RegexOptions.Singleline | RegexOptions.IgnoreCase);
            // Remove HTML tags
            the_html = Regex.Replace(the_html, "</?[a-z][a-z0-9]*[^<>]*>", " ");
            // Remove HTML comments
            the_html = Regex.Replace(the_html, "<!--(.|\\s)*?-->", " ");
            // Remove Doctype
            the_html = Regex.Replace(the_html, "<!(.|\\s)*?>", " ");
            // Remove excessive whitespace
            the_html = Regex.Replace(the_html, "[\t\r\n]", " ");
            // Finding the HTML tags and replacing them with an empty string
            the_html = Regex.Replace(the_html, @"<[^>]*?>|<[^>]*>", " ");
            // Removing special characters
            the_html = Regex.Replace(the_html, @"\r|\n|\t|(&nbsp);|(&quot);|['"";@!,&?%\.*:#/\\-]|—|…|→|0|1|2|3|4|5|6|7|8|9", " ");

            return the_html;
        }

        protected void lbLearned_Click(object sender, EventArgs e)
        {
        }

        protected void lbLearning_Click(object sender, EventArgs e)
        {
        }

        protected void lbLogout_Click(object sender, EventArgs e)
        {
        }

        protected void ddlEnglishLevel_SelectedIndexChanged(object sender, EventArgs e)
        {
        }
    }
}
