SproutCMS

This is the code documentation for the SproutCMS project

source of /sprout/Helpers/FileIndexing.php

  1. <?php
  2. /*
  3.  * Copyright (C) 2017 Karmabunny Pty Ltd.
  4.  *
  5.  * This file is a part of SproutCMS.
  6.  *
  7.  * SproutCMS is free software: you can redistribute it and/or modify it under the terms
  8.  * of the GNU General Public License as published by the Free Software Foundation, either
  9.  * version 2 of the License, or (at your option) any later version.
  10.  *
  11.  * For more information, visit <http://getsproutcms.com>.
  12.  */
  13.  
  14. namespace Sprout\Helpers;
  15.  
  16.  
  17. /**
  18. * Does indexing for various file formats
  19. **/
  20. class FileIndexing
  21. {
  22.  
  23. /**
  24.   * Returns the extension of the specified file.
  25.   * By the way, this value is always lowercased, because the other helpers expect that.
  26.   *
  27.   * @param string $filename The filename to get the ext of.
  28.   **/
  29. static public function getExt($filename)
  30. {
  31. $parts = explode ('.', $filename);
  32. $ext = array_pop ($parts);
  33. $ext = strtolower($ext);
  34. return $ext;
  35. }
  36.  
  37. /**
  38.   * Returns true if the specified extension is supported,
  39.   * false if it is not
  40.   *
  41.   * @param string $ext The extension to check
  42.   * @return boolean True if supported, false otherwise
  43.   **/
  44. static public function isExtSupported($ext)
  45. {
  46. if (! function_exists('exec')) return false;
  47.  
  48. switch ($ext) {
  49. case 'txt':
  50. case 'csv':
  51. return true;
  52. }
  53.  
  54. if (! function_exists('exec')) return false;
  55. if (! function_exists('escapeshellarg')) return false;
  56. if (! function_exists('shell_exec')) return false;
  57.  
  58. switch ($ext) {
  59. case 'pdf':
  60. exec ('pdftotext -v', $output, $return);
  61. if ($return != 127) {
  62. return true;
  63. }
  64. break;
  65.  
  66. case 'doc':
  67. exec ('antiword', $output, $return);
  68. if ($return != 127) {
  69. return true;
  70. }
  71. break;
  72.  
  73. case 'docx':
  74. exec ('perl -v', $output, $return);
  75. if ($return != 127) {
  76. return true;
  77. }
  78. break;
  79.  
  80. case 'odt':
  81. exec ('odt2txt --version', $output, $return);
  82. if ($return != 127) {
  83. return true;
  84. }
  85. break;
  86.  
  87. case 'xls':
  88. exec ('xls2csv', $output, $return);
  89. if ($return != 127) {
  90. return true;
  91. }
  92. break;
  93.  
  94. }
  95.  
  96. return false;
  97. }
  98.  
  99.  
  100. /**
  101.   * Returns the plaintext version of a formatted file.
  102.   * Returns null on error.
  103.   *
  104.   * @param string $filename The file to process.
  105.   * @param string $ext Allows the file type to be forced.
  106.   * @return string The plain text, or null if there was an error.
  107.   **/
  108. static public function getPlaintext($filename, $ext = null)
  109. {
  110. $unlink = false;
  111.  
  112. if (! $ext) $ext = self::getExt($filename);
  113.  
  114. if ($filename[0] == '/') {
  115. $index_filename = $filename;
  116. } else {
  117. $index_filename = File::createLocalCopy($filename);
  118. $unlink = true;
  119. }
  120.  
  121. switch ($ext) {
  122. case 'txt':
  123. case 'csv':
  124. return file_get_contents($index_filename);
  125.  
  126. case 'pdf':
  127. return self::getPdf($index_filename);
  128.  
  129. case 'doc':
  130. return self::getDoc($index_filename);
  131.  
  132. case 'docx':
  133. return self::getDocx($index_filename);
  134.  
  135. case 'odt':
  136. return self::getOdt($index_filename);
  137.  
  138. case 'xls':
  139. return self::getXls($index_filename);
  140.  
  141. }
  142.  
  143. if ($unlink) {
  144. File::cleanupLocalCopy($index_filename);
  145. }
  146.  
  147. return null;
  148. }
  149.  
  150.  
  151. /**
  152.   * Uses 'pdftotext' to get the contents of a pdf
  153.   *
  154.   * @param $filename The filename to process.
  155.   * @return string The plaintext version of the file, or null if there was an error.
  156.   **/
  157. static private function getPdf($filename)
  158. {
  159. $filename = escapeshellarg ($filename);
  160. return shell_exec("pdftotext {$filename} -");
  161. }
  162.  
  163. /**
  164.   * Uses 'antiword' to get the contents of a doc
  165.   *
  166.   * @param $filename The filename to process.
  167.   * @return string The plaintext version of the file, or null if there was an error.
  168.   **/
  169. static private function getDoc($filename)
  170. {
  171. $filename = escapeshellarg ($filename);
  172. return shell_exec("antiword {$filename}");
  173. }
  174.  
  175. /**
  176.   * Uses a perl script to get the contents of a docx
  177.   *
  178.   * @param $filename The filename to process.
  179.   * @return string The plaintext version of the file, or null if there was an error.
  180.   **/
  181. static private function getDocx($filename)
  182. {
  183. $filename = escapeshellarg ($filename);
  184. return shell_exec ("perl indexing/docx2txt.pl {$filename} -");
  185. }
  186.  
  187. /**
  188.   * Uses a 'odt2txt' to get the contents of a odt
  189.   *
  190.   * @param $filename The filename to process.
  191.   * @return string The plaintext version of the file, or null if there was an error.
  192.   **/
  193. static private function getOdt($filename)
  194. {
  195. $filename = escapeshellarg ($filename);
  196. return shell_exec("odt2txt {$filename}");
  197. }
  198.  
  199. /**
  200.   * Uses a 'xls2csv' to get the contents of a xls
  201.   *
  202.   * @param $filename The filename to process.
  203.   * @return string The plaintext version of the file, or null if there was an error.
  204.   **/
  205. static private function getXls($filename)
  206. {
  207. $filename = escapeshellarg ($filename);
  208. return shell_exec("xls2csv {$filename}");
  209. }
  210. }
  211.  
  212.  
  213.