The C# (.NET) code below helps you clean up HTML that you have copied from MS Word. This is very helpful when you convert a Word document to pure HTML or XHTML.
/// <summary>
/// Strips Microsoft Word specific markup from an HTML fragment and normalizes a few
/// tags (<c>br</c>, <c>img</c>) to their self-closing form.
/// </summary>
/// <param name="html">The HTML text to clean. Must not be <c>null</c>.</param>
/// <returns>The cleaned HTML. An empty input is returned unchanged.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="html"/> is <c>null</c>.</exception>
static string CleanHtml(string html)
{
    if (html == null)
    {
        throw new System.ArgumentNullException("html");
    }

    if (html.Length == 0)
    {
        return html;
    }

    // RegexOptions.Compiled is deliberately not used: these are static Regex calls with
    // more distinct patterns than the static regex cache holds by default, so compiling
    // each pattern would cost far more than it saves. Matching results are unaffected.
    const RegexOptions options = RegexOptions.IgnoreCase;

    // --- Office paragraph tags and mso-/margin/indent style declarations ---
    html = Regex.Replace(html, @"<(o:p|\/o:p)>", "", options);
    html = Regex.Replace(html, @"\s*mso-[^:]+:[^;\x22]+;?", "", options);
    html = Regex.Replace(html, @"\s*MARGIN-\w{3,6}:\s*0\w{2}\s*;?", "", options);
    html = Regex.Replace(html, @"\s*MARGIN\s*:(\s*0\w{2}){3}\s*;?", "", options);
    html = Regex.Replace(html, @"\s*TEXT-INDENT: 0.{2}\s*;", "", options);

    // --- Simple literal normalizations ---
    html = html.Replace("<br>", "<br/>");
    // NOTE(review): written as an explicit escape because the search literal is visually
    // a blank. It is assumed to be a non-breaking space: replacing every ASCII space
    // would stop the later patterns that require a literal space ("<img ", " class=")
    // from ever matching. Confirm against the original source bytes.
    html = html.Replace("\u00A0", "\t");
    html = html.Replace("align=\"middle\"", "align=\"center\"");

    // --- XML declarations, namespaced tags (<o:...>, <v:...>, ...) and x:/tab-stops leftovers ---
    html = Regex.Replace(html, @"<\\?\?xml[^>]*>", "", options);
    html = Regex.Replace(html, @"<\/?\w+:[^>]*>", "", options);
    html = Regex.Replace(html, @"x:\w{3}(=\x22[\s\w]*\x22)", "", options);
    html = Regex.Replace(html, @"\s*tab-stops:[^;""]*;?", "", options);

    // --- Replace non-http image sources with the alt text of the image at the same index ---
    MatchCollection imageSources = Regex.Matches(html, @"<img [^>]*src=[\x22|\']([^\x22|\']+)", options);
    MatchCollection imageAlts = Regex.Matches(html, @"<img [^>]*alt=[\x22|\']([^\x22|\']+)", options);

    // Sources and alts are paired purely by position, so stop at the shorter collection
    // instead of indexing past the end when some images have no alt attribute.
    for (int i = 0; i < imageSources.Count && i < imageAlts.Count; i++)
    {
        string source = imageSources[i].Groups[1].Value;
        if (source.IndexOf("http", System.StringComparison.OrdinalIgnoreCase) < 0)
        {
            html = html.Replace(source, imageAlts[i].Groups[1].Value);
        }
    }

    // --- Unwrap HTML comments around <style> content ---
    html = Regex.Replace(html, @"(<style>)\s*\r*<!(--)?", "$1", options);
    html = Regex.Replace(html, @"-->\s*\r*(</style>)", "$1", options);

    // Drop the single quotes around a style attribute value. Only groups 1-3 exist in
    // this pattern; referencing undefined groups would emit the "$n" text literally.
    html = Regex.Replace(html, @"(<\w[^>]*style=)(')([^ |>]*)'", "$1$3", options);

    // --- Remove double quotes around font-family names (4, 3, 2, then 1 name) ---
    html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11$13$15", options);
    html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11", options);
    html = Regex.Replace(html, @"(font-family:)\s*\x22([\w\s-]*)\s*\x22(,)\s*\x22([\w\s-]*)\s*\x22", "$1$2$3$4", options);
    html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3", options);

    // --- Self-close <img> tags that are not already self-closed ---
    // The character just before '>' is kept inside group 1 so that it is preserved
    // whatever it is (quote, letter, whitespace).
    html = Regex.Replace(html, @"(<IMG\s*[^>]*[^/])>\s*(</IMG>)?", "$1/>", options);

    // --- Remove <meta> tags and bare mso style attributes ---
    html = Regex.Replace(html, @"(<meta\s*[^>]*)([^/]>)\s*(</meta>)?", "", options);
    html = Regex.Replace(html, @"Style\s*=\x22?\s*mso-?\w*\s*\x22?", "", options);

    // --- Remove empty inline/table elements ---
    html = Regex.Replace(html, @"<font\s*[^>]*>\s*</font>", "", options);

    // Removing an empty <span> can leave its parent <span> empty, so the pass is
    // repeated; up to four passes handles four levels of nesting.
    const int emptySpanPasses = 4;
    for (int pass = 0; pass < emptySpanPasses; pass++)
    {
        string stripped = Regex.Replace(html, @"<span\s*[^>]*>\s*</span>", "", options);
        if (stripped.Length == html.Length)
        {
            // Every match is non-empty and is replaced by "", so equal length means no match.
            break;
        }

        html = stripped;
    }

    html = Regex.Replace(html, @"<td\s*[^>]*>\s*</td>", "", options);
    html = Regex.Replace(html, @"<i\s*[^>]*>\s*</i>", "", options);
    html = Regex.Replace(html, @"<b\s*[^>]*>\s*</b>", "", options);
    html = Regex.Replace(html, @"<span\s*[^>]*/>", "", options);
    html = Regex.Replace(html, @"<font\s*[^>]*/>", "", options);

    // --- Merge duplicated style attributes, drop numeric class names, unwrap bare spans ---
    html = Regex.Replace(html, @"(style\s*=\s*\x22[^\x22]*)(\s*\x22\s*Style\s*=\s*\x22)", "$1;", options);
    html = Regex.Replace(html, @"Class\s*=\x22?\s*\d*-\d*\s*\x22?", "", options);
    html = Regex.Replace(html, @"(<span\s*>)([\s\w]*)(</span>)", "$2", options);

    // NOTE(review): the earlier version ended with Replace calls for the middle dot,
    // registered, copyright and trademark characters whose search and replacement text
    // were identical, so they had no effect and were dropped. Presumably they were
    // meant to translate HTML entities; confirm the intended direction before restoring.
    return html;
}
Was this post helpful to you? If so, please like it on any of the social media platforms below.
That way it can be useful to others too.