Using the HTML Agility Package to modify web pages

Step 1: Create a new Visual Studio project

Create a new Console application:

Step 2: Use the Package Manager console to install the HTML Agility package.

Select Tools > NuGet Package Manager > Package Manager Console

Enter: Install-Package HtmlAgilityPack

Step 3: Add the reference to the HtmlAgilityPack.dll

Right-click your References folder, and browse to the HtmlAgilityPack.dll file that gets installed to the ‘packages’ folder in your Visual Studio project.

Select and add this file:

Step 4: Try it on some examples

1. Insert the ‘DOCTYPE’ comment before the main <html> tag and insert a “<script>” element inside the <head> tag.

Program.cs

namespace HtmlAgilityPack
{
   /// <summary>
   /// Sample console program: loads an HTML file, inserts a DOCTYPE declaration
   /// in front of the root html element, appends a script element to the head
   /// element, and writes the document back to the same file.
   /// </summary>
   internal class Program
   {
      /// <summary>
      /// Entry point. Rewrites C:\dump\html.html in place.
      /// </summary>
      /// <param name="args">Command-line arguments (unused).</param>
      /// <exception cref="System.InvalidOperationException">
      /// The loaded document has no html element or no head element.
      /// </exception>
      private static void Main(string[] args)
      {
         const string script = @"<script src=""../resources/javascript/stopExecutionOnTimeout.js""></script>";
         const string comment = "<!DOCTYPE html>\n";

         var htmlDoc = new HtmlDocument();
         htmlDoc.Load(@"C:\dump\html.html");

         // SelectSingleNode returns null when nothing matches, so both anchors are
         // validated up front, before the document is modified in any way.
         var htmlNode = htmlDoc.DocumentNode.SelectSingleNode("html");
         if (htmlNode == null)
         {
            throw new System.InvalidOperationException(
               "The document has no root <html> element to place the DOCTYPE in front of.");
         }

         var headNode = htmlDoc.DocumentNode.SelectSingleNode("//head");
         if (headNode == null)
         {
            throw new System.InvalidOperationException(
               "The document has no <head> element to append the script to.");
         }

         // The DOCTYPE declaration is created as a comment node and placed
         // immediately before the root element.
         var commentNode = htmlDoc.CreateComment(comment);
         htmlDoc.DocumentNode.InsertBefore(commentNode, htmlNode);

         // The script element becomes the last child of <head>.
         var scriptNode = HtmlNode.CreateNode(script);
         headNode.AppendChild(scriptNode);

         htmlDoc.Save(@"C:\dump\html.html");
      }
   }
}

HTML - before

<html>
	<head>	
	</head>
	
	<body>	
		<article id="main">
			<h1>Hello</h1>		
		</article>
	</body>
</html>

HTML - after

<!DOCTYPE html>
<html>
	<head>
	
	<script src="../resources/javascript/stopExecutionOnTimeout.js"></script></head>
	
	<body>	
		<article id="main">
			<h1>Hello</h1>		
		</article>
	</body>
</html>

2. Insert a new "div" element between an existing "article" element and its child nodes.

Program.cs

using System.Collections.Generic;

namespace HtmlAgilityPack
{
   /// <summary>
   /// Sample console program: loads an HTML file, wraps the children of the
   /// first article element in a new div with class "section", and writes the
   /// document back to the same file.
   /// </summary>
   internal class Program
   {
      /// <summary>
      /// Creates a new element called <paramref name="name"/>, inserts it in front
      /// of the first node of <paramref name="nodes"/>, fills it with deep clones of
      /// every node in the collection, and removes the original nodes from their
      /// parent.
      /// </summary>
      /// <param name="nodes">Sibling nodes to wrap; must contain at least one node.</param>
      /// <param name="name">Tag name of the wrapping element, e.g. "div".</param>
      /// <returns>The newly created wrapping element.</returns>
      /// <exception cref="System.ArgumentException">
      /// <paramref name="nodes"/> is null or empty.
      /// </exception>
      static HtmlNode CloneAsParentNode(HtmlNodeCollection nodes, string name)
      {
         if (nodes == null || nodes.Count == 0)
         {
            throw new System.ArgumentException("At least one node is required to wrap.", "nodes");
         }

         // Snapshot the nodes before touching the tree: Main passes the parent's
         // own ChildNodes collection, and the insert below is made into that same
         // parent, so the wrapper must not end up in the list of nodes to move.
         var originals = new List<HtmlNode>(nodes);
         var first = originals[0];
         var parent = first.ParentNode;
         var newParent = first.OwnerDocument.CreateElement(name);

         parent.InsertBefore(newParent, first);

         foreach (var node in originals)
         {
            // Deep-clone into the wrapper, then drop the original from the old parent.
            newParent.AppendChild(node.CloneNode(true));
            parent.RemoveChild(node);
         }

         return newParent;
      }

      /// <summary>
      /// Entry point. Rewrites C:\dump\html.html in place.
      /// </summary>
      /// <param name="args">Command-line arguments (unused).</param>
      /// <exception cref="System.InvalidOperationException">
      /// The loaded document contains no article element.
      /// </exception>
      private static void Main(string[] args)
      {
         var htmlDoc = new HtmlDocument();
         htmlDoc.Load(@"C:\dump\html.html");

         // SelectSingleNode returns null when nothing matches.
         var articleNode = htmlDoc.DocumentNode.SelectSingleNode("//article");
         if (articleNode == null)
         {
            throw new System.InvalidOperationException(
               "The document has no <article> element to wrap.");
         }

         // Use the wrapper returned by the helper directly instead of looking it
         // up again with an XPath that only matches when the article has id="main".
         var newSection = CloneAsParentNode(articleNode.ChildNodes, "div");
         newSection.SetAttributeValue("class", "section");

         htmlDoc.Save(@"C:\dump\html.html");
      }
   }
}

HTML - before

<html>
    <body>   
        <article id="main">
            <h1>Hello</h1>      
        </article>
    </body>
</html>

HTML - after

<html>
    <body>   
        <article id="main">
			<div class="section">
				<h1>Hello</h1>      
			</div>
		</article>
    </body>
</html>

Leave a Reply