| | 1 | | namespace Songhay.Xml; |
| | 2 | |
|
/// <summary>
/// Static members for HTML text processing.
/// </summary>
/// <remarks>
/// Every member works with regular expressions rather than a markup parser,
/// so the conversions are best-effort: markup is recognized as
/// <c>&lt;name ... &gt;</c> runs that contain no <c>&gt;</c> character.
/// </remarks>
public static class HtmlUtility
{
    // An xmlns declaration together with the whitespace that precedes it.
    // Trailing whitespace is left in place so that the next attribute
    // stays separated from the element name.
    static readonly Regex XmlnsAttributeRegex =
        new(@"\s+xmlns\s*=\s*(?:""[^""]*""|'[^']*')");

    // Elements that ConvertToXml closes with " />"; the look-ahead stops
    // longer names that merely start with one of these from matching.
    static readonly Regex VoidElementRegex =
        new(@"<\s*(br|hr|img|link|meta)(?=[\s/>])([^>]*)>", RegexOptions.IgnoreCase);

    // An opening tag. Requiring a letter after '<' skips closing tags,
    // comments, DOCTYPE declarations and processing instructions.
    static readonly Regex OpeningTagRegex =
        new(@"<(?<tag>[A-Za-z][^\s/>]*)(?<attrs>[^>]*)>");

    // One token of an attribute list: either an attribute (with a quoted value,
    // an unquoted value, or no value at all) or a stray quoted string.
    // Quoted text is always consumed as a whole, so it is never scanned for names.
    // The name is atomic so a failed value cannot be retried with a shorter name.
    static readonly Regex AttributeTokenRegex = new(
        @"(?<![^\s""'=<>/])(?<name>(?>[^\s""'=<>/]+))" +
        @"(?:\s*=\s*(?:(?<quoted>""[^""]*""|'[^']*')|(?<bare>[^\s""'>]+?)(?=\s|/?\z))|(?!\s*=))" +
        @"|""[^""]*""|'[^']*'");

    // A double-quoted href attribute.
    static readonly Regex HrefAttributeRegex =
        new(@"href\s*=\s*""[^""]+""");

    // An ampersand that does not start a named or numeric character reference.
    static readonly Regex RawAmpersandRegex =
        new(@"&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)");

    // A minimized (self-closed) element that FormatXhtmlElements expands.
    static readonly Regex MinimizedElementRegex =
        new(@"<(a|iframe|td|th|script)(?=[\s/])([^>]*?)\s*/>", RegexOptions.IgnoreCase);

    /// <summary>
    /// Returns a string of marked up text compatible
    /// with browsers that do not support XHTML
    /// (loosely towards HTML 4.x W3C standard).
    /// </summary>
    /// <param name="input">A <see cref="string"/> of markup.</param>
    /// <returns>
    /// The converted markup, or <c>null</c> when <paramref name="input"/>
    /// is <c>null</c>, empty or white space.
    /// </returns>
    public static string? ConvertToHtml(string? input)
    {
        if (string.IsNullOrWhiteSpace(input)) return null;

        // Drop the closing tags of selected elements.
        input = Regex.Replace(input, @"</(base|isindex|link|meta)>",
            string.Empty, RegexOptions.IgnoreCase);

        // Remove every attribute from the html element.
        input = Regex.Replace(input, @"<html(\s[^>]*)?>",
            "<html>", RegexOptions.IgnoreCase);

        // Remove XHTML element minimization ("<br />" becomes "<br>").
        input = Regex.Replace(input, @"\s*/>", ">");

        // Remove XHTML attribute minimization (foo="foo" becomes foo)
        // inside every tag that is not a closing tag.
        return Regex.Replace(input, @"<[^/][^>]*>",
            tag => Regex.Replace(tag.Value, @"\s+([^\s=""'<>/]+)\s*=\s*""\1""", " $1"));
    }

    /// <summary>
    /// Attempts to convert HTML to well-formed XML.
    /// </summary>
    /// <param name="html">An HTML <see cref="string"/>.</param>
    /// <returns>
    /// The converted markup, or <c>null</c> when <paramref name="html"/>
    /// is <c>null</c>, empty or white space.
    /// </returns>
    /// <remarks>This task is simpler than converting to XHTML.</remarks>
    public static string? ConvertToXml(string? html)
    {
        if (string.IsNullOrWhiteSpace(html)) return null;

        // Remove xmlns attributes.
        html = XmlnsAttributeRegex.Replace(html, string.Empty);

        // Close open elements (br, hr, img, link, meta).
        html = VoidElementRegex.Replace(html, EvaluateOpenElement);

        // Expand minimized attributes and quote unquoted attribute values.
        html = OpeningTagRegex.Replace(html, EvaluateOpeningTag);

        // Escape raw ampersands in href values; existing character references are kept.
        html = HrefAttributeRegex.Replace(html,
            href => RawAmpersandRegex.Replace(href.Value, "&amp;"));

        // Break up any remaining "xmlns" text with a soft hyphen,
        // written as the numeric character reference &#173;.
        return html.Replace("xmlns", "x&#173;mlns");
    }

    /// <summary>
    /// Returns an XHTML string derived from a .NET procedure.
    /// </summary>
    /// <param name="xmlFragment">
    /// A well-formed <see cref="string"/> of XML.
    /// </param>
    /// <returns>
    /// The fragment with minimized <c>a</c>, <c>iframe</c>, <c>td</c>, <c>th</c>
    /// and <c>script</c> elements rewritten as start/end tag pairs,
    /// or <c>null</c> when <paramref name="xmlFragment"/> is <c>null</c>,
    /// empty or white space.
    /// </returns>
    /// <remarks>
    /// This member addresses certain quirks
    /// that well-formed XML cannot have in a contemporary Web browser.
    /// </remarks>
    public static string? FormatXhtmlElements(string? xmlFragment)
    {
        if (string.IsNullOrWhiteSpace(xmlFragment)) return null;

        // Group 1 is the element name, group 2 the attribute text (possibly empty).
        return MinimizedElementRegex.Replace(xmlFragment,
            m => $"<{m.Groups[1].Value}{m.Groups[2].Value}></{m.Groups[1].Value}>");
    }

    /// <summary>
    /// Returns the inner fragment of XML
    /// from the specified unique element.
    /// </summary>
    /// <param name="xmlFragment">
    /// A well-formed <see cref="string"/> of XML.
    /// </param>
    /// <param name="elementName">
    /// The local name of the element in the XML string.
    /// </param>
    /// <returns>
    /// The text between the first start tag and the last end tag of the element,
    /// or the whole fragment when the element is not found; in both cases with
    /// four leading spaces removed from each line that follows a CR-LF.
    /// <c>null</c> when <paramref name="xmlFragment"/> is <c>null</c>,
    /// empty or white space.
    /// </returns>
    public static string? GetInnerXml(string? xmlFragment, string? elementName)
    {
        if (string.IsNullOrWhiteSpace(xmlFragment)) return null;

        string ret = xmlFragment;

        if (!string.IsNullOrWhiteSpace(elementName))
        {
            // The name is matched literally and must end at the name boundary,
            // so "b" does not select <body>.
            string name = Regex.Escape(elementName);
            string pattern = $@"<{name}(?=[\s/>])[^>]*>([\s\S]+)</{name}>";

            Match m = Regex.Match(xmlFragment, pattern, RegexOptions.IgnoreCase);
            if (m.Success) ret = m.Groups[1].Value;
        }

        // Remove first four spaces at start of line.
        return Regex.Replace(ret, @"\r\n {4}", "\r\n");
    }

    /// <summary>
    /// Emits a public <c>DOCTYPE</c> tag.
    /// </summary>
    /// <param name="rootElement">
    /// The root element of the DTD.
    /// </param>
    /// <param name="publicIdentifier">
    /// The public identifier of the DTD.
    /// </param>
    /// <param name="resourceReference">
    /// The link to reference material of the DTD.
    /// </param>
    /// <returns>
    /// A public <c>DOCTYPE</c> tag.
    /// </returns>
    public static string PublicDocType(string? rootElement = "html",
        string? publicIdentifier = "-//W3C//DTD XHTML 1.0 Transitional//EN",
        string? resourceReference = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd") =>
        $"<!DOCTYPE {rootElement} PUBLIC \"{publicIdentifier}\" \"{resourceReference}\">";

    #region Regular Expression Match Evaluators

    // Rewrites one attribute token: a value-less name becomes name="name",
    // an unquoted value gets double quotes, anything already quoted is kept.
    static string EvaluateAttributeToken(Match match)
    {
        Group name = match.Groups["name"];

        if (!name.Success) return match.Value; //stray quoted text
        if (match.Groups["quoted"].Success) return match.Value; //already well-formed

        Group bare = match.Groups["bare"];
        string value = bare.Success ? bare.Value : name.Value;

        return $@"{name.Value}=""{value}""";
    }

    // Normalizes the attribute list of one opening tag.
    static string EvaluateOpeningTag(Match match)
    {
        string attributes = match.Groups["attrs"].Value;
        if (attributes.Length == 0) return match.Value;

        return string.Concat(
            "<",
            match.Groups["tag"].Value,
            AttributeTokenRegex.Replace(attributes, EvaluateAttributeToken),
            ">");
    }

    // Turns the trailing ">" of an open element into " />".
    static string EvaluateOpenElement(Match match)
    {
        string s = match.Value;

        //Refuse closed elements:
        if (match.Groups[2].Value.TrimEnd().EndsWith("/", StringComparison.Ordinal)) return s;

        return string.Concat(s.Substring(0, s.Length - 1), " />");
    }

    #endregion
}