SproutCMS

This is the code documentation for the SproutCMS project

source of /sprout/Helpers/Text.php

Copyright (C) 2017 Karmabunny Pty Ltd.

This file is a part of SproutCMS.

SproutCMS is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation, either
version 2 of the License, or (at your option) any later version.

For more information, visit <http://getsproutcms.com>.

This class was originally from Kohana 2.3.4
Copyright 2007-2008 Kohana Team
  1. <?php
  2. /**
  3.  * Copyright (C) 2017 Karmabunny Pty Ltd.
  4.  *
  5.  * This file is a part of SproutCMS.
  6.  *
  7.  * SproutCMS is free software: you can redistribute it and/or modify it under the terms
  8.  * of the GNU General Public License as published by the Free Software Foundation, either
  9.  * version 2 of the License, or (at your option) any later version.
  10.  *
  11.  * For more information, visit <http://getsproutcms.com>.
  12.  *
  13.  * This class was originally from Kohana 2.3.4
  14.  * Copyright 2007-2008 Kohana Team
  15.  */
  16. namespace Sprout\Helpers;
  17.  
  18.  
  19.  
  20. /**
  21.  * Various text helpers such as limiting.
  22.  */
  23. class Text
  24. {
  25.  
  /**
   * Truncates a plain-text phrase to at most a given number of words.
   *
   * A word is any run of non-whitespace characters. Trailing whitespace is stripped from
   * the result, and the end marker is only appended when part of the text was cut off.
   *
   * @param string $str Phrase to truncate, in plain text
   * @param int $limit Maximum number of words to keep; for non-blank text a value of zero
   *        or less returns just the end marker
   * @param string|null $end_char Marker appended when the text is truncated, e.g. '...';
   *        NULL selects the default '…'
   * @return string Plain text
   */
  public static function limitWords($str, $limit = 100, $end_char = NULL)
  {
      $max_words = (int) $limit;
      $suffix = $end_char ?? '…';

      // Blank input is handed back exactly as received
      if (trim($str) === '') {
          return $str;
      }

      if ($max_words <= 0) {
          return $suffix;
      }

      preg_match('/^\s*+(?:\S++\s*+){1,'.$max_words.'}/u', $str, $matches);
      $kept = $matches[0];

      // The marker is only wanted when the kept portion is shorter than the input
      $was_cut = (strlen($kept) !== strlen($str));

      return rtrim($kept) . ($was_cut ? $suffix : '');
  }
  51.  
  /**
   * Truncates a plain-text phrase to a given number of characters.
   *
   * Text that is blank, or already no longer than the limit, is returned unchanged.
   * Lengths are measured in characters (multibyte aware), not bytes.
   *
   * @param string $str Phrase to truncate, in plain text
   * @param int $limit Number of characters to limit to
   * @param string|null $end_char Marker appended when the text is truncated, e.g. '...';
   *        NULL selects the default '…'
   * @param boolean $preserve_words True to extend the cut to the end of the word it lands in;
   *        false to allow ending on a partial word
   * @return string Plain text
   */
  public static function limitChars($str, $limit = 100, $end_char = NULL, $preserve_words = FALSE)
  {
      $suffix = $end_char ?? '…';
      $max_chars = (int) $limit;

      if (trim($str) === '' || mb_strlen($str) <= $max_chars) {
          return $str;
      }

      if ($max_chars <= 0) {
          return $suffix;
      }

      if (!$preserve_words) {
          // Hard cut at the character limit
          $cut = mb_substr($str, 0, $max_chars);
          return rtrim($cut) . $suffix;
      }

      // Take limit-1 characters, then run on to the end of the current word
      preg_match('/^.{'.($max_chars - 1).'}\S*/us', $str, $matches);
      $kept = $matches[0];

      if (strlen($kept) == strlen($str)) {
          return rtrim($kept);
      }

      return rtrim($kept) . $suffix;
  }
  82.  
  /**
   * Limits HTML to a certain number of words.
   *
   * Tags are not counted as words, and any elements still open when the output ends are
   * closed. HTML comments are removed and runs of whitespace are collapsed to one space.
   * A word is a run of ASCII letters, digits, hyphens or underscores.
   *
   * Closing tags with no matching open element are dropped, and stray '<' or '>'
   * characters are passed through as ordinary non-word text.
   *
   * Known limitation: '...' is appended as soon as any text or opening tag follows the
   * last permitted word, so input with exactly $limit words followed by punctuation
   * (e.g. a full stop) still receives the ellipsis.
   *
   * @param string $text HTML to truncate
   * @param int $limit Maximum number of words to output
   * @return string Truncated HTML
   */
  public static function limitWordsHtml($text, $limit = 50)
  {
      $count = 0;
      $offset = 0;
      $over = false;
      $out = '';
      $stack = [];

      // These shouldn't have an end tag
      $single_tags = '/^(?:br|wbr|area|hr|img|input)$/i';

      // Nuke HTML comments and duplicate space
      $text = preg_replace('/<!--.*?-->/s', '', $text);
      $text = preg_replace('/\s\s+/', ' ', $text);

      // The alternation is grouped so that \G anchors every branch at $offset; without the
      // group only the first branch is anchored and a later match would desync $offset.
      //                 opening tag        closing tag      words            non-words / stray brackets
      $pattern = '!\G(?:(<[a-z0-9]+[^>]*>)|(</[a-z0-9]+>)|([-_a-zA-Z0-9]+)|([^-_a-zA-Z0-9<>]+|[<>]))!si';

      while (preg_match($pattern, $text, $m, 0, $offset)) {
          $token = $m[0];

          if (($m[2] ?? '') !== '') {
              // Closing tag: unwind the stack down to the matching element. A closing tag
              // that was never opened is dropped instead of popping an empty stack forever.
              $closing = strtolower($token);
              if (in_array($closing, $stack, true)) {
                  do {
                      $pop = array_pop($stack);
                      $out .= $pop;
                  } while ($pop !== $closing);
              }

          } else {
              // Anything other than a closing tag after the limit ends the output
              if ($over) {
                  $out .= '...';
                  break;
              }

              $out .= $token;

              if ($m[1] !== '') {
                  // Opening tag: remember how to close it, unless it is a void element
                  preg_match('!^<([a-z0-9]+)!i', $token, $tag);
                  if (!preg_match($single_tags, $tag[1])) {
                      $stack[] = '</' . strtolower($tag[1]) . '>';
                  }

              } elseif (($m[3] ?? '') !== '') {
                  // Word; compared against '' so that the word "0" is counted too
                  $count++;
                  if ($count == $limit) {
                      $over = true;
                  }
              }
          }

          $offset += strlen($token);
      }

      // Close whatever is still open
      while (($pop = array_pop($stack)) !== null) {
          $out .= $pop;
      }

      return $out;
  }
  140.  
  141.  
  /**
   * Reports whether the given HTML contains a FORM tag, which can cause nested-form issues.
   *
   * SCRIPT and STYLE elements (and everything inside them) are ignored when searching.
   *
   * Not tested with malformed input - should not be used as an XSS filter
   *
   * @param string $html HTML to check
   * @return bool True if the string contains a FORM tag, false if it doesn't
   */
  public static function containsFormTag($html)
  {
      $needle = '<form';

      // Cheap early exit before any regex work
      if (stripos($html, $needle) === false) {
          return false;
      }

      // Script and style bodies are CDATA, so discard those elements entirely
      $cdata_elements = [
          '!<script[^>]*>.+?</script>!is',
          '!<style[^>]*>.+?</style>!is',
      ];
      $remaining = preg_replace($cdata_elements, '', $html);

      return stripos($remaining, $needle) !== false;
  }
  163.  
  164.  
  /**
   * Cycles through the given strings, returning the next one on each call.
   *
   * The position is kept in a static counter shared by all callers. Calling the method
   * with no arguments resets the counter and returns an empty string.
   *
   * @param string ...$strings Two or more strings to alternate between
   * @return string
   */
  public static function alternate()
  {
      static $i;

      $choices = func_get_args();

      if ($choices === []) {
          // No arguments: reset the cycle
          $i = 0;
          return '';
      }

      $index = $i % count($choices);
      $i++;

      return $choices[$index];
  }
  184.  
  /**
   * Collapses runs of two or more slashes into a single slash.
   *
   * A run that directly follows a colon (as in "http://") is left alone.
   *
   * @param string $str String to reduce slashes of
   * @return string
   */
  public static function reduceSlashes($str)
  {
      $repeated_slashes = '#(?<!:)//+#';
      $reduced = preg_replace($repeated_slashes, '/', $str);

      return $reduced;
  }
  195.  
  /**
   * Replaces the given words with a string.
   *
   * Matching is case-insensitive. A '*' inside a bad word acts as a wildcard for any run of
   * non-whitespace characters. When the replacement is exactly one character long it is
   * repeated to the length of the matched text; otherwise each match is replaced by the
   * replacement string as a whole.
   *
   * @param string $str Phrase to replace words in
   * @param array $badwords Words to replace; empty entries are ignored
   * @param string $replacement Replacement string
   * @param boolean $replace_partial_words When true, a bad word is only matched if it is
   *        delimited on both sides by a word boundary, whitespace or the string edge; when
   *        false it is matched anywhere, including inside longer words. This probably
   *        doesn't do what you think it does; check the test suite.
   * @return string
   */
  public static function censor($str, array $badwords, $replacement = '#', $replace_partial_words = FALSE)
  {
      $patterns = [];
      foreach ($badwords as $badword) {
          $badword = (string) $badword;
          if ($badword === '') {
              continue;
          }

          // Quote for the '!' delimiter used below, then turn a literal '*' into a wildcard
          $patterns[] = str_replace('\*', '\S*?', preg_quote($badword, '!'));
      }

      // With nothing to look for, an empty alternation would match at every position
      if (count($patterns) === 0) {
          return $str;
      }

      $regex = '('.implode('|', $patterns).')';

      if ($replace_partial_words == TRUE) {
          // Just using \b isn't sufficient when we need to replace a badword that already contains word boundaries itself
          $regex = '(?<=\b|\s|^)'.$regex.'(?=\b|\s|$)';
      }

      $regex = '!'.$regex.'!ui';

      if (mb_strlen($replacement) == 1) {
          // Single-character replacement: mask the match character for character
          $mask = function ($matches) use ($replacement) {
              return str_repeat($replacement, mb_strlen($matches[1]));
          };
          return preg_replace_callback($regex, $mask, $str);
      }

      return preg_replace($regex, $replacement, $str);
  }
  232.  
  /**
   * Finds the leading text that all of the given words have in common.
   *
   * The comparison is done character by character (multibyte aware), so the returned
   * prefix never ends part-way through a multibyte character.
   *
   * @param array $words Words to find the common prefix of
   * @return string The shared prefix; empty if there is none or no words were given
   */
  public static function similar(array $words)
  {
      // The first word is the one every other word is compared against.
      // reset() is used so the result doesn't depend on the array's internal pointer.
      $first = (string) reset($words);
      $max = mb_strlen($first);

      for ($i = 0; $i < $max; ++$i) {
          $char = mb_substr($first, $i, 1);

          foreach ($words as $w) {
              // Once a difference is found, break out of both loops
              if (mb_substr((string) $w, $i, 1) !== $char) {
                  break 2;
              }
          }
      }

      // $i is now the number of leading characters shared by every word
      return mb_substr($first, 0, $i);
  }
  257.  
  /**
   * Converts text email addresses and URLs into links.
   *
   * @param string $text Text to auto link
   * @return string
   */
  public static function autoLink($text)
  {
      // Emails go first to prevent problems with "www.domain.com@example.com"
      $with_emails = self::autoLinkEmails($text);

      return self::autoLinkUrls($with_emails);
  }
  269.  
  /**
   * Converts text URLs into links.
   *
   * Handles http/https/ftp/ftps URLs, plus bare "www." host names (which are linked with
   * an http:// prefix). Each occurrence is replaced in place, so a URL that appears more
   * than once, or that is a prefix of another URL, is never re-linked inside an anchor
   * this method has just generated.
   *
   * @param string $text Text to auto link
   * @return string
   */
  public static function autoLinkUrls($text)
  {
      // All http/https/ftp/ftps links that are not part of an existing html anchor
      $linked = preg_replace_callback(
          '~\b(?<!href="|">)(?:ht|f)tps?://\S+(?:/|\b)~i',
          function ($match) {
              return Html::anchor($match[0]);
          },
          $text
      );

      // preg_replace_callback() returns NULL on error; keep the text as it was in that case
      if ($linked !== null) {
          $text = $linked;
      }

      // All naked www.links.com (without http://)
      $linked = preg_replace_callback(
          '~\b(?<!://)www(?:\.[a-z0-9][-a-z0-9]*+)+\.[a-z]{2,6}\b~i',
          function ($match) {
              return Html::anchor('http://'.$match[0], $match[0]);
          },
          $text
      );

      if ($linked !== null) {
          $text = $linked;
      }

      return $text;
  }
  300.  
  /**
   * Converts text email addresses into mailto links.
   *
   * Each occurrence is replaced in place, so an address that is a substring of another
   * address (e.g. "bob@x.com" inside "jimbob@x.com") cannot corrupt the longer one.
   *
   * @param string $text Text to auto link
   * @return string
   */
  public static function autoLinkEmails($text)
  {
      // Finds all email addresses that are not part of an existing html mailto anchor
      // Note: The "58;" negative lookbehind prevents matching of existing encoded html mailto anchors
      // The html entity for a colon (:) is &#58; or &#058; or &#0058; etc.
      $linked = preg_replace_callback(
          '~\b(?<!href="mailto:|">|58;)(?!\.)[-+_a-z0-9.]++(?<!\.)@(?![-.])[-a-z0-9.]+(?<!\.)\.[a-z]{2,6}\b~i',
          function ($match) {
              // Replace each email with an encoded mailto
              return Html::mailto($match[0]);
          },
          $text
      );

      // preg_replace_callback() returns NULL on error; keep the text as it was in that case
      return ($linked === null) ? $text : $linked;
  }
  323.  
  /**
   * Automatically applies <p> and <br /> markup to text. Basically nl2br() on steroids.
   *
   * Blank lines separate paragraphs, single line breaks become <br />, and block-level
   * elements already present in the text are not wrapped in <p> tags.
   *
   * @param string $str Subject
   * @return string Marked-up text; an empty string if the subject is blank
   */
  public static function autoP($str)
  {
      $str = trim($str);
      if ($str === '') {
          return '';
      }

      // Standardize newlines
      $str = str_replace(array("\r\n", "\r"), "\n", $str);

      // Strip leading, then trailing, spaces and tabs on every line
      $str = preg_replace(['~^[ \t]+~m', '~[ \t]+$~m'], '', $str);

      // Elements that should not be surrounded by p tags
      $no_p = '(?:p|div|h[1-6r]|ul|ol|li|blockquote|d[dlt]|pre|t[dhr]|t(?:able|body|foot|head)|c(?:aption|olgroup)|form|s(?:elect|tyle)|a(?:ddress|rea)|ma(?:p|th))';

      // The tag-related passes are only needed when the text could contain html
      $html_found = (strpos($str, '<') !== FALSE);

      if ($html_found) {
          // Add a newline before each line that starts with one of these tags, and after
          // each line that ends with one of their closing tags
          $str = preg_replace('~^<'.$no_p.'[^>]*+>~im', "\n$0", $str);
          $str = preg_replace('~</'.$no_p.'\s*+>$~im', "$0\n", $str);
      }

      // Wrap everything in one paragraph, then split it at each run of blank lines
      $str = '<p>'.trim($str).'</p>';
      $str = preg_replace('~\n{2,}~', "</p>\n\n<p>", $str);

      if ($html_found) {
          // Remove the p tags that ended up around the block-level elements
          $str = preg_replace('~<p>(?=</?'.$no_p.'[^>]*+>)~i', '', $str);
          $str = preg_replace('~(</?'.$no_p.'[^>]*+>)</p>~i', '$1', $str);
      }

      // Whatever single linebreaks remain become <br />
      return preg_replace('~(?<!\n)\n(?!\n)~', "<br />\n", $str);
  }
  371.  
  /**
   * Returns human readable sizes.
   * @see Based on original functions written by:
   * @see Aidan Lister: http://aidanlister.com/repos/v/function.size_readable.php
   * @see Quentin Zervaas: http://www.phpriot.com/d/code/strings/filesize-format/
   *
   * When no (recognised) unit is forced, the largest unit that keeps the value at or above 1
   * is chosen. The choice is clamped to the known units, so sizes below one byte are shown
   * in bytes and sizes beyond the largest unit are shown in that largest unit.
   *
   * @param int|float $bytes Size in bytes
   * @param string|null $force_unit A definitive unit, e.g. 'MB' or 'MiB'; a unit containing
   *        'i' selects IEC prefixes regardless of $si
   * @param string|null $format The sprintf() format for the value and unit; NULL means '%01.2f %s'
   * @param boolean $si Whether to use SI prefixes (powers of 1000) or IEC (powers of 1024)
   * @return string
   */
  public static function bytes($bytes, $force_unit = NULL, $format = NULL, $si = TRUE)
  {
      // Format string
      $format = ($format === NULL) ? '%01.2f %s' : (string) $format;

      // Normalise up front; passing NULL to strpos() is deprecated as of PHP 8.1
      $force_unit = (string) $force_unit;

      if ($si == FALSE || strpos($force_unit, 'i') !== FALSE) {
          // IEC prefixes (binary)
          $units = array('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB');
          $mod = 1024;
      } else {
          // SI prefixes (decimal)
          $units = array('B', 'kB', 'MB', 'GB', 'TB', 'PB');
          $mod = 1000;
      }

      // Determine unit to use
      $power = array_search($force_unit, $units, true);
      if ($power === FALSE) {
          $power = ($bytes > 0) ? (int) floor(log($bytes, $mod)) : 0;

          // Keep the index inside $units for fractional and very large sizes
          $power = max(0, min($power, count($units) - 1));
      }

      return sprintf($format, $bytes / pow($mod, $power), $units[$power]);
  }
  410.  
  /**
   * Prevents widow words by replacing the last space in the string with a non-breaking
   * space entity, so the final two words stay on the same line.
   * @see http://www.shauninman.com/archive/2006/08/22/widont_wordpress_plugin
   *
   * Trailing whitespace is removed first. Text without a space is returned as-is
   * (apart from that trimming).
   *
   * @param string $str String to remove widows from
   * @return string
   */
  public static function widont($str)
  {
      $trimmed = rtrim($str);
      $last_space = strrpos($trimmed, ' ');

      if ($last_space === FALSE) {
          return $trimmed;
      }

      $head = substr($trimmed, 0, $last_space);
      $tail = substr($trimmed, $last_space + 1);

      return $head.'&nbsp;'.$tail;
  }
  430.  
  431.  
  /**
   * Returns a number with an english suffix appended (e.g. 1st, 5th, 12th, 123rd)
   *
   * The suffix is chosen from the magnitude of the number, so negative numbers get the
   * same suffix as their positive counterparts (e.g. -1st, -22nd).
   *
   * @param int $number The number to add a suffix to
   * @return string The number followed by 'st', 'nd', 'rd' or 'th'
   */
  public static function ordinalize($number)
  {
      // The % operator keeps the sign of its left operand, so work on the magnitude
      $last_two = abs($number % 100);

      // 11, 12 and 13 are irregular: always 'th'
      if ($last_two >= 11 && $last_two <= 13) {
          return $number . 'th';
      }

      switch ($last_two % 10) {
          case 1:
              return $number . 'st';
          case 2:
              return $number . 'nd';
          case 3:
              return $number . 'rd';
          default:
              return $number . 'th';
      }
  }
  452.  
  453.  
  /**
   * Make a chunk of valid HTML into plain text, and (optionally) limit the number of words.
   *
   * Block-level tags become blank lines, BR and LI tags become line breaks, STYLE and
   * SCRIPT elements are removed together with their content, all other tags are stripped
   * and HTML entities are decoded.
   *
   * @param string $html The original HTML
   * @param int $max_words The maximum number of words. Use 0 for no limit.
   * @return string Plain text
   */
  public static function plain($html, $max_words = 50)
  {
      $html = Enc::cleanfunky($html);

      // Normalise newlines into spaces
      $html = str_replace(["\r", "\n"], ' ', $html);

      // Remove inline style and script elements, content included. This has to happen
      // before any tag is turned into a newline: otherwise markup inside a script (e.g. a
      // string containing "<br>") gets a newline injected, the element no longer matches,
      // and its content leaks into the output.
      $html = preg_replace('!<style[^>]*>.+?</style>!is', '', $html);
      $html = preg_replace('!<script[^>]*>.+?</script>!is', '', $html);

      // Replace some HTML tags with newlines
      $html = preg_replace('!<(p|div|h[1-6]|pre|ol|ul)[^>]*?>!i', "\n\n", $html);
      $html = preg_replace('!<(br|li)[^>]*?>!i', "\n", $html);

      // Remove all other tags, and decode entities
      $html = strip_tags($html);
      $html = html_entity_decode($html, ENT_COMPAT, 'UTF-8');

      // Combine runs of multiple whitespace
      $html = preg_replace("![ \t][ \t]+!", ' ', $html);

      // Trim whitespace on each line
      $lines = array_map('trim', explode("\n", $html));
      $html = implode("\n", $lines);

      if ($max_words) {
          $html = Text::limitWords($html, $max_words, '...');
      }

      // Tidy up nbsp characters that break iconv.
      $html = str_replace("\u{00a0}", ' ', $html);

      return trim($html);
  }
  499.  
  500.  
  /**
   * Make a chunk of plain text into HTML rich text
   * The text will be wrapped within a block element (default is a P tag)
   *
   * The text is HTML-encoded and its line breaks are converted to BR tags.
   *
   * @param string $text The original plain text
   * @param string|null $block_elem The block element to use. Default is a P tag (i.e. 'p').
   *        Use null or empty string to get the result without it being wrapped in a tag.
   *        The name is written into the markup as-is, so it must be a trusted value.
   * @return string A HTML representation of the plain text
   */
  public static function richtext($text, $block_elem = 'p')
  {
      // Cast first: NULL is a documented argument, and trim(NULL) is deprecated as of PHP 8.1
      $block_elem = strtolower(trim((string) $block_elem));

      $text = Enc::cleanfunky($text);
      $text = Enc::html($text);
      $text = str_replace(array("\r\n", "\r", "\n"), '<br>', $text);

      if (!$block_elem) {
          return $text;
      }

      return "<{$block_elem}>{$text}</{$block_elem}>";
  }
  522.  
  523.  
  /**
   * Convert a lower_case name into a CamelCaps name
   *
   * Every underscore that sits between two letters or digits is removed and the character
   * after it is upper-cased; the first character of the result is then upper-cased too.
   *
   * @param string $name
   * @return string
   */
  public static function lc2camelcaps($name)
  {
      // A lookbehind is used for the preceding character so that it is not consumed by the
      // match; this lets adjacent single-character segments ("a_b_c") all be converted.
      $name = preg_replace_callback(
          '/(?<=[a-z0-9])_([a-z0-9])/i',
          function ($matches) {
              return strtoupper($matches[1]);
          },
          $name
      );
      $name = ucfirst($name);
      return $name;
  }
  541.  
  542.  
  /**
   * Convert a lower_case name into a camelCase name
   *
   * Every underscore that sits between two letters or digits is removed and the character
   * after it is upper-cased; the first character of the result is then lower-cased.
   *
   * @param string $name
   * @return string
   */
  public static function lc2camelcase($name)
  {
      // A lookbehind is used for the preceding character so that it is not consumed by the
      // match; this lets adjacent single-character segments ("a_b_c") all be converted.
      $name = preg_replace_callback(
          '/(?<=[a-z0-9])_([a-z0-9])/i',
          function ($matches) {
              return strtoupper($matches[1]);
          },
          $name
      );
      $name = lcfirst($name);
      return $name;
  }
  560.  
  561.  
  /**
   * Convert a CamelCaps or camelCase name into a lower_case name
   *
   * Each upper-case ASCII letter or digit is replaced by an underscore followed by its
   * lower-case form, and any leading underscores are then stripped.
   *
   * @param string $name
   * @return string
   */
  public static function camel2lc($name)
  {
      $name = preg_replace_callback(
          '/[A-Z0-9]/',
          function ($matches) {
              return '_' . strtolower($matches[0]);
          },
          $name
      );
      $name = ltrim($name, '_');
      return $name;
  }
  579.  
  580.  
  /**
   * Encode HTML so it's suitable for direct output, but allow some HTML tags to be left as-is
   *
   * Only a limited subset of tags are left alone, all other tags are stripped.
   * Allowed tags: A, B, I, STRONG, EM, BR, IMG, SPAN, ABBR, SUP, SUB
   * Tag names are matched case-insensitively. Allowed tags are output exactly as written,
   * attributes included; everything that is not a tag is HTML-encoded.
   *
   * The algorithm used in this method is quite simple, so this method should not be used
   * as a defence against XSS attacks; it should only be used on trusted input such as Form helptext.
   *
   * @param string $html Plain text or HTML which may contain various tags
   * @return string HTML which only contains safe tags
   */
  public static function limitedSubsetHtml($html)
  {
      static $allowed = ['a', 'b', 'i', 'strong', 'em', 'br', 'img', 'span', 'abbr', 'sup', 'sub'];

      $offset = 0;
      $out = '';

      //                      opening tag        closing tag       content
      while (preg_match('!\G(<[a-z0-9]+[^>]*>)|(</[a-z0-9]+>)|([^<>]+|<|>)!si', $html, $m, 0, $offset)) {
          if ($m[1] !== '') {
              // Opening tag: keep it only if its (lower-cased) name is whitelisted
              preg_match('!^<([a-z0-9]+)[^>]*>$!i', $m[0], $matches);
              if (in_array(strtolower($matches[1]), $allowed, true)) {
                  $out .= $m[0];
              }

          } elseif (($m[2] ?? '') !== '') {
              // Closing tag: the name sits between '</' and '>'
              $tag_name = strtolower(substr($m[0], 2, -1));
              if (in_array($tag_name, $allowed, true)) {
                  $out .= $m[0];
              }

          } else {
              // Text content, including stray angle brackets
              $out .= Enc::html($m[0]);
          }

          $offset += strlen($m[0]);
      }

      return $out;
  }
  622.  
  623.  
  /**
   * Builds the year portion of a copyright notice.
   *
   * @param string $year The original year of copyright; if empty, only the current year is used
   * @return string The current year on its own when no original year is given or when it
   *         equals the current year; otherwise "original - current"
   */
  public static function copyright($year)
  {
      $current = date('Y');

      if (empty($year)) {
          return $current;
      }

      // Normalise the supplied year by round-tripping it through a 1 January date
      $original = date('Y', strtotime($year . '-01-01'));

      return ($original == $current) ? $original : $original . ' - ' . $current;
  }
  643. }
  644.  
  645.