Converting from Word to HTML/XHTML

Wednesday, September 8, 2010

The C# .NET code below cleans up HTML that has been copied from MS Word. It is very helpful when converting a Word document to pure HTML or XHTML.

/// <summary>
/// Strips Microsoft Word specific markup (Office namespace tags, <c>mso-*</c>
/// style declarations, zero margins, empty inline elements, ...) from
/// <paramref name="html"/> and normalises a few constructs so that the result
/// is closer to plain HTML/XHTML.
/// </summary>
/// <param name="html">Markup copied or exported from MS Word.</param>
/// <returns>
/// The cleaned markup. A <c>null</c> or empty input is returned unchanged.
/// </returns>
/// <remarks>
/// This method is a pure string transformation: it does not pump the UI
/// message loop. Call it from a background thread if the input is large and
/// UI responsiveness matters.
/// </remarks>
static string CleanHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // RegexOptions.Compiled is deliberately not used: these are one-off calls
    // to the static Regex API with far more distinct patterns than the static
    // regex cache keeps (Regex.CacheSize defaults to 15), so compiled patterns
    // would be evicted and recompiled on every call.
    const RegexOptions Options = RegexOptions.IgnoreCase;

    // Word paragraph markers and Word-only style declarations.
    html = Regex.Replace(html, @"<(o:p|\/o:p)>", "", Options);
    html = Regex.Replace(html, @"\s*mso-[^:]+:[^;\x22]+;?", "", Options);
    html = Regex.Replace(html, @"\s*MARGIN-\w{3,6}:\s*0\w{2}\s*;?", "", Options);
    html = Regex.Replace(html, @"\s*MARGIN\s*:(\s*0\w{2}){3}\s*;?", "", Options);
    html = Regex.Replace(html, @"\s*TEXT-INDENT: 0.{2}\s*;", "", Options);

    html = html.Replace("<br>", "<br/>");
    // Only non-breaking spaces become tabs. Replacing every ordinary space
    // would stop the later patterns that need a literal space ("<img ",
    // " class=") from ever matching.
    html = html.Replace("&nbsp;", "\t").Replace("\u00A0", "\t");
    html = html.Replace("align=\"middle\"", "align=\"center\"");

    // XML processing instructions, namespaced tags and Excel/Word attributes.
    html = Regex.Replace(html, @"<\\?\?xml[^>]*>", "", Options);
    html = Regex.Replace(html, @"<\/?\w+:[^>]*>", "", Options);
    html = Regex.Replace(html, @"x:\w{3}(=\x22[\s\w]*\x22)", "", Options);
    html = Regex.Replace(html, @"\s*tab-stops:[^;""]*;?", "", Options);

    html = ReplaceLocalImageSources(html);

    // Unwrap the HTML comment Word puts around the contents of <style>.
    html = Regex.Replace(html, @"(<style>)\s*\r*<!(--)?", "$1", Options);
    html = Regex.Replace(html, @"-->\s*\r*(</style>)", "$1", Options);

    // Normalise single-quoted style values (without spaces or embedded double
    // quotes) to double quotes. The replacement must only reference groups
    // that exist: .NET emits an unknown "$n" literally into the output.
    html = Regex.Replace(html, @"(<\w[^>]*style=)'([^ |>\x22]*)'", "$1\"$2\"", Options);

    // Drop the double quotes around font names (four, three, two, one name).
    html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11$13$15", Options);
    html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)(,)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3$5$7$9$11", Options);
    html = Regex.Replace(html, @"(font-family:)\s*\x22([\w\s-]*)\s*\x22(,)\s*\x22([\w\s-]*)\s*\x22", "$1$2$3$4", Options);
    html = Regex.Replace(html, @"(font-family:)\s*(\x22)([\w\s-]*)\s*(\x22)", "$1$3", Options);

    html = html.Replace("&middot;", "\u00B7");

    // Self-close <img> tags that are not already self-closed, without
    // consuming the last character of the tag. The space before "/>" keeps an
    // unquoted trailing attribute value from swallowing the slash.
    html = Regex.Replace(html, @"(<img\b[^>]*[^/>])>\s*(?:</img>)?", "$1 />", Options);

    html = Regex.Replace(html, @"(<meta\s*[^>]*)([^/]>)\s*(</meta>)?", "", Options);
    html = Regex.Replace(html, @"Style\s*=\x22?\s*mso-?\w*\s*\x22?", "", Options);

    html = RemoveEmptyElements(html);

    // Merge a duplicated style attribute into the preceding one.
    html = Regex.Replace(html, @"(style\s*=\s*\x22[^\x22]*)(\s*\x22\s*Style\s*=\s*\x22)", "$1;", Options);
    html = Regex.Replace(html, @"Class\s*=\x22?\s*\d*-\d*\s*\x22?", "", Options);
    html = Regex.Replace(html, @"(<span\s*>)([\s\w]*)(</span>)", "$2", Options);

    // NOTE(review): the published statements replaced each of these characters
    // with itself (a no-op); presumably the named entities were meant to be
    // converted to numeric references, which are valid without a DTD. Confirm.
    html = html.Replace("&reg;", "&#174;");
    html = html.Replace("&copy;", "&#169;");
    html = html.Replace("&trade;", "&#8482;");

    return html;
}

/// <summary>
/// For every &lt;img&gt; tag whose quoted src does not contain "http" and which
/// carries a non-empty quoted alt attribute, replaces the src value with the
/// alt value. Each tag is processed on its own, so images without an alt
/// attribute are left untouched and cannot shift the pairing of other images.
/// </summary>
private static string ReplaceLocalImageSources(string html)
{
    return Regex.Replace(
        html,
        @"<img\s[^>]*>",
        tag =>
        {
            string tagText = tag.Value;
            Match src = Regex.Match(tagText, @"\ssrc=[\x22']([^\x22']+)", RegexOptions.IgnoreCase);
            Match alt = Regex.Match(tagText, @"\salt=[\x22']([^\x22']+)", RegexOptions.IgnoreCase);

            if (!src.Success || !alt.Success)
            {
                return tagText;
            }

            Group srcValue = src.Groups[1];
            if (srcValue.Value.IndexOf("http", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return tagText;
            }

            return tagText.Substring(0, srcValue.Index)
                + alt.Groups[1].Value
                + tagText.Substring(srcValue.Index + srcValue.Length);
        },
        RegexOptions.IgnoreCase);
}

/// <summary>
/// Removes empty font, span, td, i and b elements (and self-closed span/font
/// tags), repeating until nothing changes so nested empty elements collapse.
/// </summary>
private static string RemoveEmptyElements(string html)
{
    // "\b" after the tag name keeps "<i" from matching "<img" and "<b" from
    // matching "<br/>" or "<body>".
    string[] emptyElementPatterns =
    {
        @"<font\b[^>]*>\s*</font>",
        @"<span\b[^>]*>\s*</span>",
        @"<td\b[^>]*>\s*</td>",
        @"<i\b[^>]*>\s*</i>",
        @"<b\b[^>]*>\s*</b>",
        @"<span\b[^>]*/>",
        @"<font\b[^>]*/>",
    };

    string before;
    do
    {
        // Each pass only ever deletes text, so the loop terminates.
        before = html;
        foreach (string pattern in emptyElementPatterns)
        {
            html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
        }
    }
    while (html != before);

    return html;
}

Was this post helpful to you? If so, please like it on any of the social media sites below.
Let it be useful for others too..

kick it on DotNetKicks.com



Also read: Which web application is running under which application pool and worker process. To receive full blog posts and updates in your inbox, click here and enter your email address.

blog comments powered by Disqus
Click here and Follow us on Twitter