I have a WebBrowser control in which a web site is loaded. The web page contains a lot of tabular data; the user selects some of it, and I need to parse the selected data and show it in a DataGridView.
This is how I am extracting the selected text from the WebBrowser control.
/// <summary>
/// Returns the plain text of the current selection in <c>webBrowser1</c>.
/// </summary>
/// <returns>
/// The selected text, or an empty string when no document is loaded or the
/// selection yields no text.
/// </returns>
private string GetSelectedText()
{
    // Document is null until a page has been loaded into the control;
    // dereferencing it unconditionally would throw NullReferenceException.
    if (webBrowser1.Document == null)
    {
        return string.Empty;
    }

    // The underlying COM document is accessed late-bound through dynamic.
    dynamic document = webBrowser1.Document.DomDocument;
    if (document == null)
    {
        return string.Empty;
    }

    dynamic selection = document.selection;
    if (selection == null)
    {
        return string.Empty;
    }

    dynamic range = selection.createRange();
    if (range == null)
    {
        return string.Empty;
    }

    string text = (string)range.text;
    return text ?? string.Empty;
}
Now it is getting very hard to extract the data properly from the selected text. So my question is: is it possible to get the HTML of the selected text?
These are the sites from which I need to parse data: https://www.sec.gov/Archives/edgar/data/1108134/000110813423000018/bhlb-20230630.htm https://www.sec.gov/Archives/edgar/data/66740/000006674023000058/mmm-20230630.htm
This is the routine I am currently using to parse the selected data, but I do not think the approach I am following is a good one.
/// <summary>
/// Raw text to be parsed into the grid; read by <c>Form2_Load</c> and split on "\r\n".
/// Presumably set by the caller to the text selected in the browser - verify against the caller.
/// </summary>
public string SelectedText { get; set; }
/// <summary>
/// Parses <see cref="SelectedText"/> line by line and fills <c>dgv</c>:
/// the first line determines how many grid columns are created, and every
/// non-empty line from the first one containing "Dollars in millions"
/// onwards becomes a row (label followed by its numeric values).
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form2_Load(object sender, EventArgs e)
{
    // Nothing to parse; the original code crashed here on null or empty input.
    if (string.IsNullOrEmpty(SelectedText))
    {
        return;
    }

    const string startMarker = "Dollars in millions";

    // Characters stripped from a line when isolating its label text.
    char[] labelNoise = { '%', '€', ';', ',', '.', '$', '£', '(', ')' };
    char[] space = { ' ' };

    string[] lines = SelectedText.Split(new[] { "\r\n" }, StringSplitOptions.None);

    // ---- Header: derive the grid columns from the first line (done once) ----
    // Split never returns an empty array, so lines[0] always exists.
    string header = lines[0].Trim();
    string caption = "";

    // A parenthesised run without digits, e.g. "(Dollars in millions)".
    Match captionMatch = Regex.Match(lines[0], "\\(\\D*\\)", RegexOptions.IgnoreCase);
    if (captionMatch.Success)
    {
        caption = captionMatch.Groups[0].Value;
    }

    // string.Replace throws ArgumentException for an empty search string,
    // so the caption is only removed when one was actually found.
    string captionToRemove = caption.Trim();
    if (captionToRemove.Length > 0)
    {
        header = header.Replace(captionToRemove, "");
    }

    List<string> columns = header.Trim().Split(space).ToList();
    columns.Insert(0, caption);

    // Create the columns only if the grid does not already have enough of them.
    if (dgv.Columns.Count < columns.Count)
    {
        for (int colCounter = 1; colCounter <= columns.Count; colCounter++)
        {
            string columnName = "col_" + colCounter;
            dgv.Columns.Add(columnName, "");
            dgv.Columns[columnName].SortMode = DataGridViewColumnSortMode.NotSortable;
        }
    }

    // ---- Rows: one per non-empty line once the start marker has been seen ----
    bool startParse = false;
    foreach (string line in lines)
    {
        if (line == "" || !(startParse || line.Contains(startMarker)))
        {
            continue;
        }

        startParse = true;

        // Label = the line with digits and hyphens removed, then the noise
        // characters stripped. NOTE(review): the class [\d-1] is kept exactly
        // as written; it appears to match digits and '-' - confirm intent.
        string lineItem = Regex.Replace(line.Trim(), @"[\d-1]", string.Empty);
        lineItem = ReplaceMultipleChar(lineItem, labelNoise, string.Empty).Trim();
        if (lineItem == "")
        {
            continue;
        }

        string values = "";
        if (line.Length > lineItem.Length)
        {
            // "(123)" is accounting notation for a negative number: "(" -> "-", ")" -> separator.
            values = GetNumericData(line).Replace("(", "-").Replace(")", " ").Trim();
        }

        // RemoveEmptyEntries: replacing ")" with a space leaves double spaces
        // ("-56  78"), which previously produced empty cells and shifted the
        // remaining values one column to the right.
        List<string> rowValues = values.Split(space, StringSplitOptions.RemoveEmptyEntries).ToList();
        rowValues.Insert(0, lineItem);
        dgv.Rows.Add(rowValues.ToArray());
    }
}
/// <summary>
/// Filters <paramref name="input"/> down to the characters that can belong to
/// numeric columns: digits, '.', '-', spaces, a '(' that is immediately
/// followed by a digit, and a ')' that is immediately preceded by a digit.
/// </summary>
/// <param name="input">The line of text to filter; may be null or empty.</param>
/// <returns>The kept characters in their original order; empty for null or empty input.</returns>
private string GetNumericData(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return "";
    }

    var output = new System.Text.StringBuilder(input.Length);
    for (int i = 0; i < input.Length; i++)
    {
        char c = input[i];
        bool keep;
        switch (c)
        {
            case '(':
                // Bounds check comes first: a trailing '(' used to index past the end.
                keep = i + 1 < input.Length && char.IsDigit(input[i + 1]);
                break;
            case ')':
                // Bounds check comes first: a leading ')' used to index before the start.
                keep = i > 0 && char.IsDigit(input[i - 1]);
                break;
            default:
                keep = c == '.' || c == '-' || c == ' ' || char.IsDigit(c);
                break;
        }

        if (keep)
        {
            output.Append(c);
        }
    }

    return output.ToString();
}
/// <summary>
/// Splits <paramref name="s"/> on every character in <paramref name="separators"/>,
/// discards the empty pieces, and joins what is left with <paramref name="newVal"/>.
/// </summary>
/// <param name="s">The text to process.</param>
/// <param name="separators">The characters to split on.</param>
/// <param name="newVal">The string placed between the remaining pieces.</param>
/// <returns>The remaining pieces joined by <paramref name="newVal"/>.</returns>
public string ReplaceMultipleChar(string s, char[] separators, string newVal)
{
    // Empty pieces are dropped, so leading, trailing and consecutive
    // separators do not contribute a newVal to the result.
    return string.Join(newVal, s.Split(separators, StringSplitOptions.RemoveEmptyEntries));
}
/// <summary>
/// Replaces every match of the regular expression <paramref name="separators"/>
/// in <paramref name="s"/> with <paramref name="newVal"/>.
/// </summary>
/// <param name="s">The text to search.</param>
/// <param name="separators">A regular-expression pattern (not a literal character list).</param>
/// <param name="newVal">The replacement string passed to <c>Regex.Replace</c>.</param>
/// <returns>The text with all matches replaced.</returns>
public string ReplaceAll(string s, string separators, string newVal)
{
    string replaced = Regex.Replace(input: s, pattern: separators, replacement: newVal);
    return replaced;
}
/// <summary>
/// Replaces every whole-word occurrence of the literal text
/// <paramref name="wordToFind"/> in <paramref name="original"/>.
/// </summary>
/// <param name="original">The text to search.</param>
/// <param name="wordToFind">
/// The literal text to find. It is escaped before being put into the pattern,
/// so characters such as '.', '(' or '$' match themselves.
/// </param>
/// <param name="replacement">The replacement string passed to <c>Regex.Replace</c>.</param>
/// <param name="regexOptions">Options forwarded to <c>Regex.Replace</c>, e.g. IgnoreCase.</param>
/// <returns>
/// The text with all whole-word occurrences replaced; <paramref name="original"/>
/// unchanged when <paramref name="wordToFind"/> is null or empty.
/// </returns>
public string ReplaceWholeWord(string original, string wordToFind, string replacement, RegexOptions regexOptions = RegexOptions.None)
{
    // An empty search term would otherwise match at every word boundary.
    if (string.IsNullOrEmpty(wordToFind))
    {
        return original;
    }

    // Lookarounds instead of \b: \b requires a word character on one side, so
    // it can never anchor a term that starts or ends with punctuation, such as
    // "(loss)". For ordinary words the two forms match identically.
    string pattern = @"(?<!\w)" + Regex.Escape(wordToFind) + @"(?!\w)";
    return Regex.Replace(original, pattern, replacement, regexOptions);
}
Could someone please tell me how to get the HTML of the selected text from the WebBrowser control, or suggest any other good approach to parsing the selected tabular data, which I need to show in a DataGridView?
Thanks
That's web scraping, and some existing tools are already made for that.
Nevertheless, you can extract the data directly from the web browser's console using JavaScript and XPath.
Here is an extract of the relevant code I already use for that with C# and XPath: