C# How to extract html table from selected text of web browser control

56 Views Asked by At

I have a WebBrowser control that loads a web site. The page contains a lot of tabular data; the user selects part of it, and I need to parse the selected data and show it in a DataGridView.

This is how I am extracting the selected text from the WebBrowser control.

/// <summary>
/// Returns the plain text of the current selection in <c>webBrowser1</c>.
/// </summary>
/// <returns>
/// The selected text, or an empty string when no document is loaded or the
/// document exposes no selection object.
/// </returns>
private string GetSelectedText()
{
    // Document is null until a page has been loaded; bail out instead of throwing.
    if (webBrowser1.Document == null)
    {
        return string.Empty;
    }

    // The underlying COM document is accessed late-bound through dynamic.
    dynamic document = webBrowser1.Document.DomDocument;
    if (document == null)
    {
        return string.Empty;
    }

    dynamic selection = document.selection;
    if (selection == null)
    {
        return string.Empty;
    }

    dynamic text = selection.createRange().text;
    return (string)text;
}

It is now getting very hard to extract the data properly from the selected text. So my question is: is it possible to get the HTML of the selected text instead?

These are the pages from which I need to parse data: https://www.sec.gov/Archives/edgar/data/1108134/000110813423000018/bhlb-20230630.htm https://www.sec.gov/Archives/edgar/data/66740/000006674023000058/mmm-20230630.htm

This is my current routine for parsing the selected data, but the approach I am following is not very good.

/// <summary>
/// Text to be parsed into the grid when the form loads.
/// </summary>
public string SelectedText { get; set; }

/// <summary>
/// Parses <see cref="SelectedText"/> and fills <c>dgv</c>.
/// The first line is treated as the header: one grid column is created for the
/// parenthesised label found in it plus one per space-separated token of the rest.
/// Row parsing starts at the first line containing "Dollars in millions"; every
/// non-empty line from there on becomes a row made of its label text followed by
/// its numeric values.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form2_Load(object sender, EventArgs e)
{
    // Nothing selected: there is no header line to derive columns from.
    if (string.IsNullOrEmpty(SelectedText))
    {
        return;
    }

    // Characters stripped from a line to isolate its label text.
    char[] labelNoise = { '%', '€', ';', ',', '.', '$', '£', '(', ')' };
    string[] lines = SelectedText.Split(new[] { "\r\n" }, StringSplitOptions.None);

    #region Build DataGrid columns from the header line
    // The header is always the first selected line, so it is parsed once up front
    // rather than once per row.
    string headerLine = lines[0];
    Match labelMatch = Regex.Match(headerLine, "\\(\\D*\\)", RegexOptions.IgnoreCase);
    string headerLabel = labelMatch.Success ? labelMatch.Groups[0].Value : "";

    string headerRest = headerLine.Trim();
    // string.Replace throws ArgumentException for an empty search string, so only
    // remove the label when one was actually found.
    if (headerLabel.Trim().Length > 0)
    {
        headerRest = headerRest.Replace(headerLabel.Trim(), "");
    }

    List<string> columns = headerRest.Trim().Split(new char[] { ' ' }).ToList();
    columns.Insert(0, headerLabel);

    if (dgv.Columns.Count < columns.Count)
    {
        for (int colNumber = 1; colNumber <= columns.Count; colNumber++)
        {
            string columnName = "col_" + colNumber;
            dgv.Columns.Add(columnName, "");
            dgv.Columns[columnName].SortMode = DataGridViewColumnSortMode.NotSortable;
        }
    }
    #endregion

    #region Extract data for each row
    bool tableStarted = false;
    foreach (string line in lines)
    {
        // Skip blank lines and everything before the "Dollars in millions" marker.
        if (line == "" || !(tableStarted || line.Contains("Dollars in millions")))
        {
            continue;
        }
        tableStarted = true;

        // Label = the line with digits, hyphens and currency/punctuation removed.
        // In .NET the class [\d-1] matches a digit, '-' or '1'.
        string label = Regex.Replace(line.Trim(), @"[\d-1]", string.Empty);
        label = ReplaceMultipleChar(label, labelNoise, string.Empty).Trim();
        if (label == "")
        {
            continue;
        }

        string values = "";
        if (line.Length > label.Length)
        {
            // Keep only the numeric part; "(123)" becomes "-123".
            values = GetNumericData(line).Trim();
            values = values.Replace("(", "-").Replace(")", " ").Replace(",", "").Trim();
        }

        List<string> cells = values.Trim().Split(new char[] { ' ' }).ToList();
        cells.Insert(0, label);
        dgv.Rows.Add(cells.ToArray());
    }
    #endregion
}
/// <summary>
/// Filters <paramref name="input"/> down to the characters that make up numeric
/// values: digits, '.', '-', spaces, an opening parenthesis directly followed by a
/// digit, and a closing parenthesis directly preceded by a digit.
/// </summary>
/// <param name="input">Text to filter; may be null or empty.</param>
/// <returns>The retained characters in their original order; empty for null or empty input.</returns>
private string GetNumericData(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return "";
    }

    var output = new System.Text.StringBuilder(input.Length);
    for (int i = 0; i < input.Length; i++)
    {
        char c = input[i];
        bool keep;

        if (c == '(')
        {
            // Bounds check comes first so a trailing '(' cannot index past the end.
            keep = i + 1 < input.Length && Char.IsDigit(input[i + 1]);
        }
        else if (c == ')')
        {
            // Bounds check comes first so a leading ')' cannot index before the start.
            keep = i > 0 && Char.IsDigit(input[i - 1]);
        }
        else
        {
            keep = c == '.' || c == '-' || c == ' ' || Char.IsDigit(c);
        }

        if (keep)
        {
            output.Append(c);
        }
    }
    return output.ToString();
}
/// <summary>
/// Splits <paramref name="s"/> at every character in <paramref name="separators"/>,
/// drops the empty pieces, and joins what is left with <paramref name="newVal"/>.
/// </summary>
/// <param name="s">Text to process.</param>
/// <param name="separators">Characters to cut the text at.</param>
/// <param name="newVal">String placed between the remaining pieces.</param>
/// <returns>The re-joined text.</returns>
public string ReplaceMultipleChar(string s, char[] separators, string newVal)
    => String.Join(newVal, s.Split(separators, StringSplitOptions.RemoveEmptyEntries));

/// <summary>
/// Replaces every match of the regular expression <paramref name="separators"/>
/// in <paramref name="s"/> with <paramref name="newVal"/>.
/// </summary>
/// <param name="s">Text to search.</param>
/// <param name="separators">Regular-expression pattern to match.</param>
/// <param name="newVal">Replacement string.</param>
/// <returns>The text with all matches replaced.</returns>
public string ReplaceAll(string s, string separators, string newVal)
    => Regex.Replace(input: s, pattern: separators, replacement: newVal);

/// <summary>
/// Replaces every whole-word occurrence of <paramref name="wordToFind"/> in
/// <paramref name="original"/> with <paramref name="replacement"/>.
/// </summary>
/// <param name="original">Text to search.</param>
/// <param name="wordToFind">Literal word to look for; it is matched as text, not as a pattern.</param>
/// <param name="replacement">Replacement string.</param>
/// <param name="regexOptions">Options passed to the underlying regular expression.</param>
/// <returns>The text with the matching words replaced.</returns>
public string ReplaceWholeWord(string original, string wordToFind, string replacement, RegexOptions regexOptions = RegexOptions.None)
{
    // Escape the word so characters such as '.', '(' or '+' are matched literally
    // instead of being interpreted as regex syntax.
    string pattern = @"\b" + Regex.Escape(wordToFind) + @"\b";
    return Regex.Replace(original, pattern, replacement, regexOptions);
}

Could someone please help me get the HTML of the selected text from the WebBrowser control, or suggest any other good approach to parsing the selected tabular data that I need to show in a DataGridView?

Thanks

1

There are 1 best solutions below

0
Puygrenier Solann On

That's web scraping. Some existing tools are already made for that.

Nevertheless you can extract from the console of the web browser using js + Xpath directly.

Here is an extract of the relevant code I already use for that, with C# + XPath:

/// <summary>
/// Simple container for two string values extracted from a page.
/// </summary>
public class MyData
{
    /// <summary>First extracted value.</summary>
    public string data_1 { get; set; }

    /// <summary>Second extracted value.</summary>
    public string data_2 { get; set; }
}
//...
// Downloads a page, loads it into an HtmlAgilityPack document and pulls values
// out of it with XPath. The URL, header and XPath strings are placeholders.
string url = "your url";
var client = new RestClient(url);
var request = new RestRequest("", Method.Get);
request.AddHeader("",""); //if needed but for your web site there is a "acceptable policy of automated tools" to set.

var res = client.Execute(request);
if (res.IsSuccessStatusCode is not true) throw new InvalidOperationException("Request to '" + url + "' failed.");

HtmlDocument xdc = new HtmlDocument();
// Strip non-breaking-space entities including their trailing ';' so no stray
// semicolons are left in the text. Content may be null on an empty response.
string sanitazed = Regex.Replace(res.Content ?? "", "&nbsp;?", "");
xdc.LoadHtml(sanitazed);

// Single values: SelectSingleNode yields null when nothing matches.
string mydata1 = xdc.DocumentNode.SelectSingleNode("you xpath 1")?.InnerText ?? "";
string mydata2 = xdc.DocumentNode.SelectSingleNode("you xpath 2")?.InnerText ?? "";
MyData result = new() { data_1 = mydata1, data_2 = mydata2 };

// or alternative for loop,

var nodes = xdc.DocumentNode.SelectNodes("xpath that look like //table//tbody//tr/td[5]/a");
if (nodes is null) throw new InvalidOperationException(" xpath on error, please check");
foreach (var node in nodes) {
  // Query relative to the current node (use XPath starting with "./" or ".//");
  // distinct names avoid clashing with the outer mydata1/mydata2 locals.
  string nodeData1 = node.SelectSingleNode("you xpath 1")?.InnerText ?? "";
  string nodeData2 = node.SelectSingleNode("you xpath 2")?.InnerText ?? "";
  // or with attribut
  var data1 = node.GetAttributeValue("href","?");
  var data2 = node.GetAttributeValue("href","?");

}