Below is code I wrote that avoids duplicate data headers. When creating a data table, each column must have a unique name. In addition, an HTML row may sometimes go out of bounds (contain more cells than the table has columns), in which case you need to add extra columns to the data table; otherwise that data is dropped. This was my solution.
'''
/// <summary>
/// Selects what <c>HtmlServices.RenameDuplicateHeaders</c> does with a header cell
/// whose text repeats an earlier header cell of the same table.
/// </summary>
public enum DuplicateHeaderReplacementStrategy
{
    /// <summary>Append a letter suffix to the duplicate: A, B, ..., Z, AA, AB, ...</summary>
    AppendAlpha,

    /// <summary>Append a numeric suffix to the duplicate: 0, 1, 2, ...</summary>
    AppendNumeric,

    /// <summary>Clear the content of the duplicate header cell.</summary>
    Delete
}

/// <summary>
/// Helpers for preparing parsed HTML tables for conversion into tabular data.
/// </summary>
public class HtmlServices
{
    private static readonly string[] Alpha = new[] { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" };

    /// <summary>
    /// Makes the text of the <c>th</c> cells unique within every <c>table</c> of the document.
    /// The first occurrence of a header text is left untouched; each later occurrence is
    /// rewritten according to <paramref name="strategy"/>.
    /// </summary>
    /// <param name="doc">The document to modify in place. A null document is returned as-is.</param>
    /// <param name="strategy">
    /// How duplicates are rewritten. An unrecognised value leaves the headers untouched.
    /// </param>
    /// <returns>The same <paramref name="doc"/> instance that was passed in.</returns>
    public static HtmlDocument RenameDuplicateHeaders(HtmlDocument doc, DuplicateHeaderReplacementStrategy strategy)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        var tables = doc?.DocumentNode?.SelectNodes("//table");
        if (tables == null)
        {
            return doc;
        }

        foreach (HtmlNode table in tables)
        {
            // ".//th" finds header cells anywhere below the table (th normally sits inside
            // tr/thead); a bare "th" would only match direct children of <table>.
            var headerCells = table.SelectNodes(".//th");
            if (headerCells == null)
            {
                continue;
            }

            // Maps every header text already taken in this table to the next suffix index
            // to try for it. Case-insensitive because DataTable column names are compared
            // without regard to case.
            var nextSuffixIndex = new System.Collections.Generic.Dictionary<string, int>(System.StringComparer.OrdinalIgnoreCase);

            foreach (HtmlNode cell in headerCells)
            {
                // Duplicates are detected by visible text, not by node identity.
                var name = cell.InnerText ?? string.Empty;

                int index;
                if (!nextSuffixIndex.TryGetValue(name, out index))
                {
                    // First occurrence: keep it unchanged.
                    nextSuffixIndex[name] = 0;
                    continue;
                }

                if (strategy == DuplicateHeaderReplacementStrategy.Delete)
                {
                    cell.InnerHtml = string.Empty;
                    continue;
                }

                var suffix = BuildSuffix(strategy, index);
                if (suffix == null)
                {
                    // Unknown strategy value: do nothing.
                    continue;
                }

                // Skip suffixes whose resulting name is already used by another header.
                while (nextSuffixIndex.ContainsKey(name + suffix))
                {
                    index++;
                    suffix = BuildSuffix(strategy, index);
                }

                nextSuffixIndex[name] = index + 1;
                nextSuffixIndex[name + suffix] = 0;
                cell.InnerHtml += suffix;
            }
        }

        return doc;
    }

    // Returns the suffix for the given zero-based index, or null for a strategy that does not append.
    private static string BuildSuffix(DuplicateHeaderReplacementStrategy strategy, int index)
    {
        switch (strategy)
        {
            case DuplicateHeaderReplacementStrategy.AppendNumeric:
                return index.ToString(System.Globalization.CultureInfo.InvariantCulture);
            case DuplicateHeaderReplacementStrategy.AppendAlpha:
                return ToAlphaSuffix(index);
            default:
                return null;
        }
    }

    // Converts 0, 1, ..., 25, 26, 27, ... to "A", "B", ..., "Z", "AA", "AB", ... so the
    // letter suffix never runs past the end of the Alpha table.
    private static string ToAlphaSuffix(int index)
    {
        var suffix = string.Empty;
        for (var n = index; n >= 0; n = (n / Alpha.Length) - 1)
        {
            suffix = Alpha[n % Alpha.Length] + suffix;
        }

        return suffix;
    }
}

// NOTE(review): the remainder of this method is not visible in this excerpt; the
// fragment below is reproduced unchanged.
public static DataTable GetDataTableFromHtmlTable(string url, string[] htmlIds)
{
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12;
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc = web.Load(url);
    string html = doc.DocumentNode.OuterHtml;
    doc = HtmlServices.RenameDuplicateHeaders(doc, DuplicateHeaderReplacementStrategy.AppendNumeric);
    var headers = doc.DocumentNode.SelectNodes("//tr/th");
    DataTable table = new DataTable();
    foreach (HtmlNode header in headers)
        if (!table.ColumnExists(header.InnerText))
        {
            table.Columns.Add(header.InnerText);
source share