SproutCMS

This is the code documentation for the SproutCMS project

source of /sprout/Helpers/RichTextSanitiser.php

  1. <?php
  2. /*
  3.  * Copyright (C) 2017 Karmabunny Pty Ltd.
  4.  *
  5.  * This file is a part of SproutCMS.
  6.  *
  7.  * SproutCMS is free software: you can redistribute it and/or modify it under the terms
  8.  * of the GNU General Public License as published by the Free Software Foundation, either
  9.  * version 2 of the License, or (at your option) any later version.
  10.  *
  11.  * For more information, visit <http://getsproutcms.com>.
  12.  */
  13.  
  14. namespace Sprout\Helpers;
  15.  
  16. use \DOMNode;
  17. use \DOMDocument;
  18.  
  19. /**
  20.  * Helper that strictly validates and sanitises user submitted HTML
  21.  * Intended for use with front-end instances of TinyMCE to ensure XSS is impossible.
  22.  *
  23.  * @note Operates in a whitelist mode; if a tag or attribute doesn't appear on the list it won't appear on the output.
  24.  */
  25. final class RichTextSanitiser
  26. {
  27. const ATTR_TYPE_TEXT = 0;
  28. const ATTR_TYPE_URL = 1;
  29. const ATTR_TYPE_CLASS = 2;
  30. const ATTR_TYPE_SRC = 3;
  31. const ATTR_TYPE_STYLE = 4;
  32.  
  33. private $dom_doc;
  34. private $errors = [];
  35. private $permitted_tags;
  36.  
  37. /**
  38.   * @var bool Permit only local (i.e. on the current server) resources in src attributes
  39.   */
  40. private $local_resources = false;
  41.  
  42. /**
  43.   * The default set of tags (and attributes) that are permitted
  44.   */
  45. public static $default_permitted_tags = [
  46. 'div' => [
  47. 'class' => self::ATTR_TYPE_CLASS,
  48. ],
  49. 'p' => [
  50. 'class' => self::ATTR_TYPE_CLASS,
  51. 'style' => self::ATTR_TYPE_STYLE
  52. ],
  53.  
  54. 'span' => [
  55. 'class' => self::ATTR_TYPE_CLASS,
  56. ],
  57. 'strong' => null,
  58. 'em' => null,
  59.  
  60. 'h1' => [
  61. 'class' => self::ATTR_TYPE_CLASS,
  62. ],
  63. 'h2' => [
  64. 'class' => self::ATTR_TYPE_CLASS,
  65. ],
  66. 'h3' => [
  67. 'class' => self::ATTR_TYPE_CLASS,
  68. ],
  69. 'h4' => [
  70. 'class' => self::ATTR_TYPE_CLASS,
  71. ],
  72. 'h5' => [
  73. 'class' => self::ATTR_TYPE_CLASS,
  74. ],
  75. 'h6' => [
  76. 'class' => self::ATTR_TYPE_CLASS,
  77. ],
  78. 'h7' => [
  79. 'class' => self::ATTR_TYPE_CLASS,
  80. ],
  81.  
  82. 'img' => [
  83. 'src' => self::ATTR_TYPE_SRC,
  84. 'alt' => self::ATTR_TYPE_TEXT
  85. ],
  86. 'a' => [
  87. 'class' => self::ATTR_TYPE_CLASS,
  88. 'href' => self::ATTR_TYPE_URL,
  89. 'title' => self::ATTR_TYPE_TEXT
  90. ],
  91.  
  92. 'ul' => [
  93. 'class' => self::ATTR_TYPE_CLASS
  94. ],
  95. 'ol' => [
  96. 'class' => self::ATTR_TYPE_CLASS
  97. ],
  98.  
  99. 'li' => [
  100. 'class' => self::ATTR_TYPE_CLASS
  101. ],
  102. ];
  103.  
  104. /**
  105.   * Construct the sanitiser given a string of HTML
  106.   *
  107.   * @param string $richtextData The HTML to sanitise
  108.   * @param array $permitted_tags An optional array of permitted tags to override the defaults
  109.   */
  110. public function __construct($richtextData, $permitted_tags = null)
  111. {
  112. // PHP-8+ deprecated this because it's disabled by default.
  113. if (PHP_VERSION_ID < 80000) {
  114. libxml_disable_entity_loader();
  115. }
  116.  
  117. $this->dom_doc = new DOMDocument();
  118. if (!@$this->dom_doc->loadHTML($richtextData, LIBXML_NOCDATA)) {
  119. $this->errors[] = 'There were errors in parsing the given HTML.';
  120. }
  121.  
  122. if ($permitted_tags) {
  123. $this->permitted_tags = $permitted_tags;
  124. } else {
  125. $this->permitted_tags = static::$default_permitted_tags;
  126. }
  127. }
  128.  
  129.  
  130. /**
  131.   * Gets a sanitised copy of the HTML
  132.   *
  133.   * @return string HTML with any elements or attributes not appearing on the whitelist removed
  134.   */
  135. public function sanitise()
  136. {
  137.  
  138. $this->sanitiseNode($this->dom_doc);
  139.  
  140. return ob_get_clean();
  141. }
  142.  
  143.  
  144. /**
  145.   * Checks whether any errors occurred during sanitising
  146.   *
  147.   * @return bool
  148.   */
  149. public function hasErrors()
  150. {
  151. return count($this->errors) > 0;
  152. }
  153.  
  154.  
  155. /**
  156.   * Get the list of errors produced during @see RichTextSanitiser::sanitise
  157.   *
  158.   * @return array An array of error messages. Obviously, you must HTML encode each entry for them to be safe
  159.   * for web use as they may contain values from the DOM.
  160.   */
  161. public function getErrors()
  162. {
  163. return $this->errors;
  164. }
  165.  
  166.  
  167. /**
  168.   * Set whether to only allow local resources to be referenced by automatically fetching attributes
  169.   * e.g. the src attribute on an <img> tag. Does not apply to the href attribute as that won't be
  170.   * automatically fetched by a browser.
  171.   *
  172.   * @param bool $local True if only local resources are allowed, false if any resource is permitted
  173.   * @return void
  174.   */
  175. public function setLocalResources($local)
  176. {
  177. $this->local_resources = (bool)$local;
  178. }
  179.  
  180.  
  181. /**
  182.   * Recursively sanitises a node and its children, echoing any output in order of appearance
  183.   *
  184.   * @param DOMNode $node The node to sanitise
  185.   */
  186. private function sanitiseNode(DOMNode $node)
  187. {
  188. switch ($node->nodeType) {
  189. case XML_ELEMENT_NODE:
  190. {
  191. // These get thrown in for free by DOMDocument::loadXML (thanks guys)
  192. // so just omit them and parse their children instead
  193. if ($node->nodeName === 'html' or $node->nodeName === 'body') {
  194. foreach ($node->childNodes as $child) {
  195. $this->sanitiseNode($child);
  196. }
  197.  
  198. return;
  199. }
  200.  
  201. if (!array_key_exists($node->nodeName, $this->permitted_tags)) {
  202. $this->errors[] = "Disallowed tag '{$node->nodeName}'";
  203.  
  204. return;
  205. }
  206.  
  207. $attributes = [];
  208. if ($node->hasAttributes()) {
  209. $allowed_attrs = $this->permitted_tags[$node->nodeName];
  210. if (!is_array($allowed_attrs)) {
  211. $this->errors[] = "'{$node->nodeName}' elements are not permitted to contain any attributes";
  212.  
  213. return;
  214. }
  215.  
  216. foreach ($node->attributes as $attr) {
  217. if (!array_key_exists($attr->name, $allowed_attrs)) {
  218. $this->errors[] = "Invalid attribute '{$attr->name}' for '{$node->nodeName}' element.";
  219.  
  220. return;
  221. }
  222.  
  223. $encoded = $this->encodeAttributeValue($allowed_attrs[$attr->name], $attr->value);
  224. if (!empty($encoded)) {
  225. $attributes[] = sprintf('%s="%s"', $attr->name, $encoded);
  226. }
  227. }
  228. }
  229.  
  230. echo "<{$node->nodeName}";
  231.  
  232. if (count($attributes)) {
  233. echo ' ', implode(' ', $attributes);
  234. }
  235.  
  236. if ($node->hasChildNodes()) {
  237. echo '>';
  238.  
  239. foreach ($node->childNodes as $child) {
  240. $this->sanitiseNode($child);
  241. }
  242.  
  243. echo "</{$node->nodeName}>";
  244. } else {
  245. echo ' />';
  246. }
  247.  
  248. break;
  249. }
  250.  
  251. case XML_TEXT_NODE:
  252. {
  253. echo htmlspecialchars($node->nodeValue, ENT_COMPAT | ENT_HTML401 | ENT_DISALLOWED, 'UTF-8', false);
  254. break;
  255. }
  256.  
  257. case XML_HTML_DOCUMENT_NODE:
  258. case XML_DOCUMENT_NODE:
  259. {
  260. foreach ($node->childNodes as $child) {
  261. $this->sanitiseNode($child);
  262. }
  263.  
  264. break;
  265. }
  266.  
  267. default:
  268. break;
  269. }
  270. }
  271.  
  272. /**
  273.   * Encodes the value of an attribute using the correct encoding based on type
  274.   *
  275.   * @param int $type The attribute type code, e.g. ATTR_TYPE_URL
  276.   * @param string $value The attribute value as seen in the DOM
  277.   * @return string|bool A safely encoded attribute value or false if the value fails the checks
  278.   */
  279. private function encodeAttributeValue($type, $value)
  280. {
  281. $value = trim($value);
  282.  
  283. switch ($type) {
  284. case self::ATTR_TYPE_URL:
  285. {
  286. $url = parse_url($value);
  287.  
  288. if ($url == false or strcasecmp($url['scheme'], 'javascript') == 0) {
  289. return false;
  290. }
  291.  
  292. break;
  293. }
  294.  
  295. case self::ATTR_TYPE_SRC:
  296. {
  297. if ($this->local_resources) {
  298. $url = parse_url($value);
  299.  
  300. if ($url == false) {
  301. return false;
  302. }
  303.  
  304. // See if the domain specified actually matches the server's primary domain
  305. if (!empty($url['host'])) {
  306. // Unfortunately HTTP_HOST is the best option that isn't stupidly complicated or
  307. // prone to breakage, e.g. Url::base().
  308.  
  309. // TODO: this needs to handle subsites and all that mess
  310. $base_domain = $_SERVER['HTTP_HOST'];
  311. if (strcasecmp($url['host'], $base_domain) !== 0) {
  312. return false;
  313. }
  314. }
  315. }
  316.  
  317. break;
  318. }
  319.  
  320. case self::ATTR_TYPE_STYLE:
  321. {
  322. if ($this->local_resources) {
  323. // TODO: style attributes are tricky; I can force a browser to fetch a remote resource with them
  324. // so we either disallow (breaks TinyMCE styling) or parse and whitelist some CSS attributes
  325.  
  326. return false;
  327. }
  328.  
  329. break;
  330. }
  331.  
  332. default:
  333. break;
  334. }
  335.  
  336. return htmlspecialchars($value, ENT_COMPAT | ENT_HTML401 | ENT_DISALLOWED, 'UTF-8', false);
  337. }
  338. }
  339.