SproutCMS

This is the code documentation for the SproutCMS project.

source of /sprout/Helpers/WorkerLinkChecker.php

  1. <?php
  2. /*
  3.  * Copyright (C) 2017 Karmabunny Pty Ltd.
  4.  *
  5.  * This file is a part of SproutCMS.
  6.  *
  7.  * SproutCMS is free software: you can redistribute it and/or modify it under the terms
  8.  * of the GNU General Public License as published by the Free Software Foundation, either
  9.  * version 2 of the License, or (at your option) any later version.
  10.  *
  11.  * For more information, visit <http://getsproutcms.com>.
  12.  */
  13.  
  14. namespace Sprout\Helpers;
  15.  
  16. use DOMDocument;
  17.  
  18. use Kohana;
  19.  
  20.  
  /**
   * Worker job which scans the rich text content of every active page for
   * broken links, and emails a HTML + CSV report of any which are found.
   */
  class WorkerLinkChecker extends WorkerBase
  {
      protected $job_name = 'Link Checker';


      /**
       * Labels for the numbered metrics reported through Worker::metric() in run()
       */
      protected $metric_names = array(
          1 => 'Total pages',
          2 => 'Pages processed',
          3 => 'Bad links found',
      );


      /**
       * Check the links on all active pages and email a report of the bad ones
       *
       * The live revision of each active page is loaded, the text of its
       * RichText widgets (area 1) is concatenated, and every link found in that
       * text is tested using checkUrl(). If any bad links are found, a HTML
       * report and a CSV attachment are emailed.
       *
       * @param string|null $email_address If set, the report is sent to this address only;
       *        otherwise it goes to the operators returned by
       *        AdminPerms::getOperatorsWithAccess('access_reportemail')
       * @return void
       **/
      public function run($email_address = null)
      {
          $q = "SELECT page.id, page.subsite_id, page.name, MAX(rev.id) AS rev_id
              FROM ~pages AS page
              INNER JOIN ~page_revisions AS rev ON rev.page_id = page.id
                  AND rev.status = 'live' AND rev.type = 'standard'
              WHERE page.active = 1
              GROUP BY page.id
              ORDER BY page.id";
          $res = Pdb::query($q, [], 'map-arr');

          // Fetch and collate rich text widgets to produce page text
          if (count($res) > 0) {
              // Index of revision id => key in $res, so each widget can be matched
              // to its page directly rather than by re-scanning every page
              $rev_index = [];
              foreach ($res as $key => $page) {
                  $res[$key]['text'] = '';

                  $rev_id = (int) $page['rev_id'];
                  if (!isset($rev_index[$rev_id])) {
                      $rev_index[$rev_id] = $key;
                  }
              }

              // Safe to interpolate; the keys are all integers
              $rev_ids = implode(', ', array_keys($rev_index));

              $q = "SELECT rev.id AS rev_id, widget.settings
                  FROM ~page_revisions AS rev
                  INNER JOIN ~page_widgets AS widget ON rev.id = widget.page_revision_id
                      AND widget.area_id = 1 AND widget.type = 'RichText'
                  WHERE rev.id IN ({$rev_ids})
                  ORDER BY widget.record_order";
              $widgets = Pdb::q($q, [], 'pdo');
              foreach ($widgets as $widget) {
                  $rev_id = (int) $widget['rev_id'];
                  if (!isset($rev_index[$rev_id])) continue;

                  // Ignore widgets with corrupt settings or without any text
                  $settings = json_decode($widget['settings'], true);
                  if (!is_array($settings)) continue;
                  if (!isset($settings['text']) || !is_scalar($settings['text'])) continue;

                  $key = $rev_index[$rev_id];
                  if ($res[$key]['text'] !== '') $res[$key]['text'] .= "\n";
                  $res[$key]['text'] .= $settings['text'];
              }
              $widgets->closeCursor();
          }

          Worker::message('Found ' . count($res) . ' page(s) total');
          Worker::metric(1, count($res));
          Worker::metric(2, 0);
          Worker::metric(3, 0);

          $errs = array();
          $processed = 0;
          $found = 0;
          foreach ($res as $row) {
              Worker::message("Checking page # {$row['id']}; '{$row['name']}'");
              $processed++;
              $found += $this->checkPage($row, $errs);
              Worker::metric(2, $processed);
              Worker::metric(3, $found);
          }

          Worker::message('');
          Worker::message(count($errs) . ' pages have bad link(s)');
          Worker::message($found . ' bad link(s) total');
          Worker::message('');

          if (count($errs) > 0) {
              Worker::message("Preparing HTML report");

              $view = new View('sprout/email/link_checker');
              $view->errs = $errs;
              $html = $view->render();


              Worker::message("Preparing CSV report");

              $csv = $this->buildCsv($errs);


              Worker::message('');
              Worker::message("Sending reports via email");

              if ($email_address) {
                  $ops = array(array(
                      'name' => 'Unknown user',
                      'email' => $email_address,
                  ));
              } else {
                  $ops = AdminPerms::getOperatorsWithAccess('access_reportemail');
              }

              $sent = 0;
              foreach ($ops as $op) {
                  if ($op['email'] == '') continue;

                  $mail = new Email();
                  $mail->AddAddress($op['email']);
                  $mail->Subject = 'Link checker report for site ' . Kohana::config('sprout.site_title');
                  $mail->SkinnedHTML($html);
                  $mail->AddStringAttachment($csv, 'link_checker_report_' . date('Y_m_d') . '.csv', 'base64', 'text/csv');
                  $result = $mail->Send();

                  if ($result) {
                      Worker::message("Sent report to {$op['name']} ({$op['email']})");
                      $sent++;
                  } else {
                      Worker::message("Sending of report to {$op['name']} ({$op['email']}) failed!");
                  }
              }

              Worker::message("{$sent} email(s) sent successfully.");
          }

          Worker::message('');
          Worker::success();
      }


      /**
       * Checks all of the links within the text of a single page
       *
       * @param array $row Page details; the keys 'id', 'subsite_id', 'name' and 'text' are read
       * @param array $errs Collected errors; bad links are appended under the key
       *        "{id}:{subsite_id}:{name}" of the page they were found on
       * @return int Number of bad links found on the page
       **/
      private function checkPage(&$row, &$errs)
      {
          // Nothing to check. This also avoids handing an empty document to
          // loadHTML(), which throws on PHP 8 (the @ operator can't suppress that)
          if (!isset($row['text']) || trim($row['text']) === '') return 0;

          // Page content is rarely a well-formed document; parser warnings aren't of interest here
          $dom = new DOMDocument();
          if (! @$dom->loadHTML($row['text'])) return 0;

          $resultname = $row['id'] . ':' . $row['subsite_id'] . ':' . $row['name'];

          $numfound = 0;
          foreach ($dom->getElementsByTagName('a') as $elem) {
              // Named anchors such as <a name="top"> are not links
              if (!$elem->hasAttribute('href')) continue;

              $href = $elem->getAttribute('href');
              $href = urldecode($href);
              $href = str_replace(' ', '%20', $href);

              $result = $this->checkUrl($href, $row['subsite_id']);
              if ($result === true) continue;

              $errs[$resultname][] = array(
                  'page_id' => $row['id'],
                  'page_name' => $row['name'],
                  'link_href' => $href,
                  'link_text' => $elem->textContent,
                  'err' => $result,
              );
              $numfound++;
          }

          return $numfound;
      }


      /**
       * Returns TRUE if the given URL is found, a string of the error message if the URL was not found.
       *
       * URLs with a non-web scheme (mailto, tel, javascript, etc) are not checked and are
       * reported as found. URLs without a scheme are resolved against the root of the subsite.
       *
       * @param string $href The URL to check
       * @param int $subsite_id Subsite used to resolve URLs which have no scheme
       * @return bool|string True if the URL is okay, otherwise a status code and message,
       *         e.g. '404 Not Found'. Failures which have no HTTP response use code 599.
       **/
      public function checkUrl($href, $subsite_id = 1)
      {
          $href = trim($href);

          if (preg_match('/^(javascript|mailto|news|irc|file|data|sms|tel|callto|skype|chrome|about|ftp):/i', $href)) {
              return true;
          }

          if (! preg_match('!^[a-z]+://!i', $href)) {
              $href = Subsites::getAbsRoot($subsite_id) . trim($href, '/');
          }

          // Schemes are case-insensitive, so 'HTTP://example.com' is a perfectly valid URL
          if (! preg_match('!^https?://!i', $href)) {
              return '599 Not a URL';
          }

          // If behind a proxy, we cannot see ourselves properly.
          // TODO: Try to think up a way this could be made to work.
          // SERVER_PORT may not be set at all (e.g. when run from the command line)
          $port = isset($_SERVER['SERVER_PORT']) ? $_SERVER['SERVER_PORT'] : null;
          if (preg_match('!://localhost/!', $href) && $port != 80) {
              return true;
          }

          $href = str_replace(' ', '%20', $href);

          // TODO This whole things should be using HttpReq but I'm in a rush rn.

          $opts = array(
              'http' => array(
                  'method' => 'HEAD',
                  'follow_location' => true,
                  'ignore_errors' => false,
                  'user_agent' => 'SproutLinkChecker/' . Sprout::getVersion() . ' (PHP/' . phpversion() . ')',
              ),
              'ssl' => array(
                  'cafile' => APPPATH . 'cacert.pem',
              ),
          );
          $context = stream_context_create($opts);

          $h = @fopen($href, 'r', false, $context);

          if ($h !== false) {
              fclose($h);
              return true;
          }

          // The HTTP stream wrapper puts the response headers into this local variable;
          // if there aren't any then no HTTP response was received
          if (empty($http_response_header)) {
              return '599 Not a URL';
          }

          // When redirects are followed there is a status line for each response;
          // the last one is the final result
          $status_line = '';
          foreach ($http_response_header as $hdr) {
              if (strpos($hdr, 'HTTP') === 0) {
                  $status_line = $hdr;
              }
          }

          $matches = array();
          if (! preg_match('/([0-9][0-9][0-9]).*$/', $status_line, $matches)) {
              return '599 Invalid response';
          }

          // The whole match is the code plus reason phrase, e.g. '404 Not Found'
          $message = $matches[0];
          $code = (int) $matches[1];

          if ($code >= 400 && $code <= 599) {
              return $message;
          }

          return '599 Network Error';
      }


      /**
       * Return a CSV for the specified errors
       *
       * @param array $errs Errors grouped by page, as collected by checkPage()
       * @return string CSV data, as produced by QueryTo::csv(), with one row per bad link
       **/
      private function buildCsv(&$errs)
      {
          $rows = array();

          foreach ($errs as $page_errs) {
              foreach ($page_errs as $err) {
                  $rows[] = $err;
              }
          }

          return QueryTo::csv($rows);
      }

  }
  272.