C# Web Browser control: How to grab html from selection

50 Views Asked by At

I have a WinForms project that uses a WebBrowser control to load a site. The site contains a lot of tabular data rendered as HTML tables. The user selects a large region of text in the WebBrowser control with the mouse; the selection may span many elements, including multiple HTML tables.

I know how to get the text of the selection. This sample code returns the selected text:

/// <summary>
/// Returns the plain text of the current selection in <c>webBrowser1</c>.
/// </summary>
/// <returns>The <c>text</c> property of the range created from the document's selection.</returns>
private string GetSelectedText()
{
    // The DOM document is accessed late-bound (dynamic), so no typed
    // interop declarations are needed for selection/createRange/text.
    dynamic dom = webBrowser1.Document.DomDocument;
    return (string)dom.selection.createRange().text;
}

But I need to get the HTML of the selected area of the WebBrowser control programmatically. I tried a code sample that is supposed to return the HTML of the selected portion of the page loaded into the WebBrowser control, but had no luck.

Below is the code that is not working as expected. Please look at it and tell me how I can grab the HTML content of a large selection from the WebBrowser control.

here is the code which is not working.

using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Windows.Forms;

namespace HtmlTableParser
{
    /// <summary>
    /// Test form hosting a <see cref="WebBrowser"/> below a "Test" button.
    /// Clicking the button dumps the table containing the current selection,
    /// then every table in the document, to the debug output.
    /// </summary>
    public partial class Form2 : Form
    {
        private WebBrowser webBrowser1;

        /// <summary>
        /// Builds the UI in code: a button at the top and a panel, holding the
        /// browser, that fills the rest of the client area.
        /// </summary>
        public Form2()
        {
            InitializeComponent();

            Button btn = new Button();
            btn.Text = "Test";
            btn.Click += button1_Click;
            this.Controls.Add(btn);

            var panel = new Panel();
            panel.Top = btn.Height + 2;
            // The panel starts (btn.Height + 2) below the top, so exactly that
            // much must be subtracted for it to end at the bottom of the client area.
            panel.Height = this.ClientSize.Height - (btn.Height + 2);
            panel.Width = this.ClientSize.Width;
            panel.Anchor = AnchorStyles.Left | AnchorStyles.Right | AnchorStyles.Top | AnchorStyles.Bottom;

            webBrowser1 = new WebBrowser();
            webBrowser1.Dock = DockStyle.Fill;
            webBrowser1.Url = new Uri("https://www.sec.gov/Archives/edgar/data/1108134/000110813423000018/bhlb-20230630.htm");

            panel.Controls.Add(webBrowser1);
            this.Controls.Add(panel);

        }

        /// <summary>Runs both diagnostics: selected table first, then all tables.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            TestSelection();
            TestAllTable();
        }

        /// <summary>
        /// Finds the table that contains the current text selection (or, failing
        /// that, the table containing the selection's start or end point) and
        /// writes its cell data to the debug output.
        /// </summary>
        private void TestSelection()
        {
            var domdoc = this.webBrowser1.Document?.DomDocument as mshtml.IHTMLDocument2;
            // createRange() yields a text range only for text selections; the
            // 'as' cast gives null otherwise (or when no document is loaded).
            var trange = domdoc?.selection?.createRange() as mshtml.IHTMLTxtRange;
            if (trange == null)
            {
                MessageBox.Show("No text is selected");
                return;
            }

            var table = GetParentTable(trange.parentElement());
            if (table == null)
            {
                // The selection's common parent is not inside a table; probe the
                // two end points individually by collapsing a copy of the range.
                var startPointRange = trange.duplicate();
                startPointRange.setEndPoint("EndToStart", trange);
                var startPointTable = GetParentTable(startPointRange.parentElement());

                var endPointRange = trange.duplicate();
                // Collapse the *end* copy onto the selection's end point.
                endPointRange.setEndPoint("StartToEnd", trange);
                var endPointTable = GetParentTable(endPointRange.parentElement());

                if (startPointTable != null)
                {
                    table = startPointTable;
                }
                else if (endPointTable != null)
                {
                    table = endPointTable;
                }
                else
                {
                    MessageBox.Show("Selection is not in Table");
                    return;
                }
            }

            var tableData = TableData.GetTableData(table);

            System.Diagnostics.Debug.WriteLine(tableData.ToString());
        }

        /// <summary>
        /// Walks up the parent chain from <paramref name="element"/> (inclusive)
        /// and returns the first element that is a table.
        /// </summary>
        /// <param name="element">Starting element; may be null.</param>
        /// <returns>The nearest enclosing table, or null when there is none.</returns>
        private mshtml.IHTMLTable GetParentTable(mshtml.IHTMLElement element)
        {
            var parent = element;
            while (parent != null)
            {
                if (parent is mshtml.IHTMLTable table)
                {
                    return table;
                }
                parent = parent.parentElement;
            }
            return null;
        }

        /// <summary>
        /// Writes the cell data of every table element in the loaded document to
        /// the debug output, each followed by a separator line of '=' characters.
        /// </summary>
        private void TestAllTable()
        {
            var domdoc = this.webBrowser1.Document?.DomDocument as mshtml.HTMLDocument;
            if (domdoc == null)
            {
                // Nothing loaded yet: there are no tables to enumerate.
                return;
            }

            foreach (var table in domdoc.getElementsByTagName("table").OfType<mshtml.IHTMLTable>())
            {
                var tableData = TableData.GetTableData(table);

                System.Diagnostics.Debug.WriteLine(tableData.ToString());
                System.Diagnostics.Debug.WriteLine(new string('=', 20));
            }
        }

    }



    /// <summary>
    /// In-memory copy of an HTML table: a list of rows, each a list of cells.
    /// </summary>
    class TableData
    {
        /// <summary>Rows in the order they were read from the table.</summary>
        public List<RowData> Rows { get; } = new List<RowData>();

        /// <summary>
        /// Reads every row of <paramref name="table"/> and, for every cell in the
        /// row, captures its inner text, row span and column span.
        /// </summary>
        /// <param name="table">The MSHTML table to read.</param>
        /// <returns>A new <see cref="TableData"/> holding one <see cref="RowData"/> per table row.</returns>
        public static TableData GetTableData(mshtml.IHTMLTable table)
        {
            var result = new TableData();

            foreach (mshtml.IHTMLTableRow htmlRow in table.rows.OfType<mshtml.IHTMLTableRow>())
            {
                var copiedRow = new RowData();
                copiedRow.AddRange(
                    htmlRow.cells
                        .OfType<mshtml.HTMLTableCell>()
                        .Select(htmlCell => new CellData
                        {
                            Text = htmlCell.innerText,
                            RowSpan = htmlCell.rowSpan,
                            ColSpan = htmlCell.colSpan
                        }));

                result.Rows.Add(copiedRow);
            }

            return result;
        }

        /// <summary>Renders the table as text, one row per line.</summary>
        public override string ToString()
        {
            return string.Concat(this.Rows.Select(r => r.ToString() + Environment.NewLine));
        }
    }

    /// <summary>
    /// One table row: an ordered list of <see cref="CellData"/>.
    /// </summary>
    class RowData : List<CellData>
    {
        /// <summary>
        /// Renders the row as tab-separated text in which a cell spanning
        /// <c>n</c> columns occupies <c>n</c> tab-delimited fields.
        /// </summary>
        /// <returns>The tab-separated cell texts.</returns>
        public override string ToString()
        {
            // string.Join already supplies one tab after each cell, so only
            // (ColSpan - 1) extra tabs are needed to cover the spanned columns.
            // Clamp at zero: an unset ColSpan is 0 and a negative count would
            // make the string constructor throw.
            return string.Join("\t", this.Select(cell => cell.Text + new string('\t', Math.Max(0, cell.ColSpan - 1))));
        }
    }

    /// <summary>
    /// A single table cell: its text plus how many rows and columns it spans.
    /// </summary>
    class CellData
    {
        /// <summary>Number of rows the cell spans.</summary>
        public int RowSpan { get; set; }

        /// <summary>Number of columns the cell spans.</summary>
        public int ColSpan { get; set; }

        /// <summary>Text content of the cell.</summary>
        public string Text { get; set; }

        /// <summary>Returns the cell's <see cref="Text"/>.</summary>
        public override string ToString()
        {
            return this.Text;
        }
    }
}

Here is an image showing how the user will select a portion of the page. Sample Image

For the last few days I have tried many approaches to get the HTML of the selected portion from the WebBrowser control, but none succeeded. Could someone please point me to the right approach?

Thanks

1

There is 1 best solution below

0
Thomas On

After a lot of effort I was able to grab the HTML from the selection, so I would like to share my code here; it may help others in the future.

using HtmlAgilityPack;
using mshtml;
using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Net;
using System.Windows.Forms;

namespace HtmlTableParser
{
    /// <summary>
    /// Form that loads a page into <c>webBrowser1</c> and, on request, extracts
    /// the table cell values found inside the user's current selection.
    /// </summary>
    public partial class WebSiteParser : Form
    {
        public WebSiteParser()
        {
            InitializeComponent();
        }

        /// <summary>Navigates the browser to the address typed in <c>txtAddress</c>.</summary>
        private void btnLoad_Click(object sender, EventArgs e)
        {
            webBrowser1.Navigate(txtAddress.Text);
        }

        /// <summary>
        /// Takes the HTML of the current selection, extracts the table rows it
        /// contains as pipe-delimited lines, and computes the largest number of
        /// '|' separators found on any line.
        /// </summary>
        private void btnProcess_Click(object sender, EventArgs e)
        {
            System.Windows.Forms.HtmlDocument htmldocument = webBrowser1.Document;
            if (htmldocument == null || string.IsNullOrEmpty(GetSelectedText()))
            {
                return;
            }

            // The selection may not be a text range (GetSelectionHtml then
            // returns null); there is nothing to parse in that case.
            string selectedHtml = GetSelectionHtml(htmldocument);
            if (string.IsNullOrEmpty(selectedHtml))
            {
                return;
            }

            string strExtractedData = ExtractTableRows(selectedHtml);
            if (strExtractedData != "")
            {
                List<string> lines = strExtractedData
                    .Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries)
                    .ToList();

                // NOTE(review): 'lines' and 'maximum' are not consumed in the code
                // shown here; presumably later processing uses them - confirm.
                var maximum = lines.Select(x => x.Count(ee => ee == '|')).DefaultIfEmpty().Max();
            }
        }

        /// <summary>
        /// Parses <paramref name="html"/> and flattens every <c>tbody</c> found
        /// under a <c>table</c> into lines of the form
        /// <c>rownumber|value|value...</c>, separated by
        /// <see cref="Environment.NewLine"/>.
        /// </summary>
        /// <remarks>
        /// The first <c>tr</c> of each <c>tbody</c> is skipped, <c>th</c> cells
        /// are ignored, and empty cells produce no output. Row numbering starts
        /// at 1 on the first row of a table that yields a non-empty value.
        /// </remarks>
        /// <param name="html">HTML fragment to parse; must not be null.</param>
        /// <returns>The extracted lines, or an empty string when nothing matched.</returns>
        private static string ExtractTableRows(string html)
        {
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing
            // matches, so every result is checked before being enumerated.
            HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table//tbody");
            if (tables == null)
            {
                return "";
            }

            var extracted = new System.Text.StringBuilder();
            bool idAttached = false;

            foreach (HtmlNode table in tables)
            {
                int rownumber = 0;
                int itemrow = 0;

                HtmlNodeCollection rows = table.SelectNodes("tr");
                if (rows == null)
                {
                    continue;
                }

                foreach (HtmlNode row in rows)
                {
                    if (itemrow > 0)
                    {
                        // Start a new output line (and allow a new row id) once
                        // anything has been written.
                        if (extracted.Length > 0)
                        {
                            extracted.Append(Environment.NewLine);
                            idAttached = false;
                        }

                        HtmlNodeCollection cells = row.SelectNodes("th|td");
                        if (cells != null)
                        {
                            foreach (HtmlNode cell in cells)
                            {
                                if (cell.OriginalName == "th")
                                {
                                    continue;
                                }

                                // Trim last so that whitespace exposed by removing
                                // '$' or line breaks is stripped as well.
                                string value = (WebUtility.HtmlDecode(cell.InnerText) ?? "")
                                    .Replace("\r", "")
                                    .Replace("\n", "")
                                    .Replace("$", "")
                                    .Trim();

                                if (value == "")
                                {
                                    continue;
                                }

                                if (rownumber == 0)
                                {
                                    rownumber = 1;
                                }

                                // Prefix each output line with its row number once.
                                if (!idAttached)
                                {
                                    extracted.Append(rownumber.ToString());
                                    idAttached = true;
                                }

                                if (value != "&nbsp;")
                                {
                                    extracted.Append("|").Append(value);
                                }
                            }
                        }
                    }

                    itemrow++;
                    if (rownumber > 0)
                    {
                        rownumber++;
                    }
                }
            }

            return extracted.ToString();
        }

        /// <summary>
        /// Returns the HTML markup of the current selection in
        /// <paramref name="document"/>.
        /// </summary>
        /// <param name="document">The WebBrowser document to read the selection from.</param>
        /// <returns>
        /// The <c>htmlText</c> of the selection when it is a text range;
        /// otherwise null.
        /// </returns>
        private string GetSelectionHtml(System.Windows.Forms.HtmlDocument document)
        {
            // Check that a document with a body is available
            if (document != null && document.Body != null && document.Body.Document != null)
            {
                IHTMLDocument2 doc2 = (IHTMLDocument2)document.DomDocument;
                IHTMLSelectionObject currentSelection = doc2.selection;

                // Check if the selection is of type TextRange
                if (currentSelection != null && currentSelection.createRange() is IHTMLTxtRange textRange)
                {
                    // Get the HTML content of the selected text range
                    return textRange.htmlText;
                }
            }

            return null;
        }

        /// <summary>
        /// Returns the plain text of the current selection in <c>webBrowser1</c>.
        /// </summary>
        /// <returns>The selected text, or an empty string when it cannot be read.</returns>
        private string GetSelectedText()
        {
            try
            {
                dynamic document = webBrowser1.Document.DomDocument;
                dynamic selection = document.selection;
                dynamic text = selection.createRange().text;
                return (string)text ?? "";
            }
            catch (Exception)
            {
                // Deliberate best-effort: any failure while reading the selection
                // (e.g. no document loaded) is treated as "nothing selected".
                return "";
            }
        }


    }
}

Thanks