SproutCMS

This is the code documentation for the SproutCMS project

source of /sprout/Helpers/DocImport/DocImportDOCX.php

  1. <?php
  2. /*
  3.  * Copyright (C) 2017 Karmabunny Pty Ltd.
  4.  *
  5.  * This file is a part of SproutCMS.
  6.  *
  7.  * SproutCMS is free software: you can redistribute it and/or modify it under the terms
  8.  * of the GNU General Public License as published by the Free Software Foundation, either
  9.  * version 2 of the License, or (at your option) any later version.
  10.  *
  11.  * For more information, visit <http://getsproutcms.com>.
  12.  */
  13.  
  14. namespace Sprout\Helpers\DocImport;
  15.  
  16. use DOMDocument;
  17. use DOMElement;
  18. use ZipArchive;
  19.  
  20. use Sprout\Helpers\Enc;
  21. use Sprout\Helpers\File;
  22.  
  23.  
  24. class DocImportDOCX extends DocImport
  25. {
  /** @var ZipArchive Archive handle for the DOCX being loaded; created and opened in load() */
  private $zip;

  /** @var array Numbering definitions as returned by loadFormats(), keyed by w:numId */
  private $number_formats;

  /** @var array Style definitions as returned by loadStyles(), keyed by w:styleId */
  private $styles;

  /** @var array Relationship targets as returned by loadRelationships(), keyed by relationship Id */
  private $relationships;

  /** @var array Raw image data read from the archive, keyed by resource file name */
  private $res;
  31.  
  32.  
  /**
   * The main load function for a document.
   * Throw an exception on error.
   *
   * Opens the DOCX archive, loads the numbering, style and relationship tables, then
   * converts word/document.xml into the intermediate XML format. Any images collected
   * while rendering the body are appended as base64-encoded `res` elements.
   *
   * @param string $filename The file. The file will exist, but may not be valid
   * @return string|null Resultant XML data as a string, or null if the document has no usable w:body
   * @throws \Exception If the archive cannot be opened, or word/document.xml is missing or unparseable
   */
  public function load($filename)
  {
      $this->number_formats = [];
      $this->styles = [];
      $this->relationships = [];
      $this->res = [];

      $this->zip = new ZipArchive();

      // ZipArchive::open() returns true on success or an integer error code on failure
      if ($this->zip->open($filename) !== true) {
          throw new \Exception('Unable to open DOCX file as a ZIP archive');
      }

      // The finally block guarantees the archive is closed on every exit path,
      // including the early "nothing to import" returns below
      try {
          $this->number_formats = $this->loadFormats();
          $this->styles = $this->loadStyles();
          $this->numbersFromStyles();
          $this->relationships = $this->loadRelationships();

          $xml = $this->zip->getFromName('word/document.xml');
          if ($xml === false or $xml === '') {
              throw new \Exception('DOCX archive does not contain word/document.xml');
          }

          $doc = new DOMDocument();
          if (!$doc->loadXML($xml) or !$doc->documentElement) {
              throw new \Exception('Unable to parse word/document.xml');
          }

          $body = $doc->documentElement->getElementsByTagName('body');
          if ($body->length == 0) return null;
          $body = $body->item(0);

          if (!$body instanceof DOMElement) return null;
          if ($body->tagName != 'w:body') return null;
          if ($body->childNodes->length == 0) return null;

          $out = '<?xml version="1.0" encoding="UTF-8" ?>' . PHP_EOL;
          $out .= '<doc>' . PHP_EOL;
          $out .= '<body>' . PHP_EOL;
          $out .= $this->block($body);
          $out .= '</body>' . PHP_EOL;

          // Resources are gathered into $this->res while the body is rendered
          foreach ($this->res as $name => $data) {
              $out .= '<res name="' . htmlspecialchars($name) . '">' . base64_encode($data) . '</res>' . PHP_EOL;
          }

          $out .= '</doc>';

          return $out;

      } finally {
          $this->zip->close();
      }
  }
  83.  
  84.  
  /**
   * Check whether a node is a block element which will produce output
   *
   * Only w:p and w:tbl elements are considered, and only when their rendered runs
   * contain something other than markup (img tags count as content).
   *
   * @param \DOMNode $elem Node to check; non-element nodes (e.g. text nodes) are never valid
   * @return bool True if the node is a block element with content, false otherwise
   */
  private function isValidBlockElem($elem)
  {
      // This is called on arbitrary siblings, which may be text or comment nodes without a tagName
      if (!$elem instanceof DOMElement) return false;

      if (!in_array($elem->tagName, ['w:p', 'w:tbl'], true)) return false;

      return strip_tags($this->renderBlockRuns($elem), '<img>') != '';
  }
  102.  
  103.  
  /**
   * Draw the block-level children (paragraphs, list items, headings and tables) of an element
   *
   * Paragraphs which render to nothing are dropped. Consecutive numbered paragraphs are
   * grouped into UL/OL elements, using a one-element look-ahead to decide when a list
   * should be nested deeper or closed.
   *
   * @param DOMElement $elem Container element, e.g. the w:body
   * @return string HTML, one entry per line, with a trailing newline
   */
  private function block($elem)
  {
      $list_stack = [];
      $list_fmt = null;
      $list_lvl = 0;
      $out = [];

      foreach ($elem->childNodes as $para) {
          if (!$para instanceof DOMElement) continue;

          // Tables
          if ($para->tagName == 'w:tbl') {
              while ($tag = array_pop($list_stack)) {
                  $out[] = "</{$tag}>";
              }

              // Every open list has just been closed, so forget the current list state;
              // otherwise a following item with the same format would be emitted
              // without its UL/OL being reopened
              $list_fmt = null;
              $list_lvl = 0;

              $out[] = $this->drawTable($para);
              continue;
          }

          // Handle tags like w:bookmarkStart, as well as esoteric ones like w:moveToRangeEnd
          if ($para->tagName != 'w:p') {
              continue;
          }

          // Render the inner tags, drop if empty
          $runs = $this->renderBlockRuns($para);
          if (strip_tags($runs, '<img>') == '') continue;

          // Determine the style
          $style = $this->determineStyle($para);

          // Look for style changes (para <-> list)
          if ($style['number_format'] != $list_fmt or $style['number_level'] != $list_lvl) {
              $listtag = $this->determineListTag($style);
              $out[] = "<{$listtag}>";
              array_push($list_stack, $listtag);

              $list_fmt = $style['number_format'];
              $list_lvl = $style['number_level'];
          }

          // Find the next sibling which is a tag we support (paragraphs and tables)
          $nextSibling = $para->nextSibling;
          while ($nextSibling and !$this->isValidBlockElem($nextSibling)) {
              $nextSibling = $nextSibling->nextSibling;
          }

          // Take a look at the next el to see if we will be raising or dropping soon.
          $lvlraise = false;
          $lvldrop = false;
          $typechange = false;
          if ($style['number_format'] and $nextSibling) {
              $nextstyle = $this->determineStyle($nextSibling);
              if ($nextstyle['number_format'] != $style['number_format'] and $nextstyle['number_level'] == $style['number_level']) {
                  $typechange = true;
                  if ($nextstyle['number_format'] == '') $lvldrop = true;

              } else if ($nextstyle['number_level'] < $style['number_level']) {
                  $lvldrop = true;
              } else if ($nextstyle['number_level'] > $style['number_level']) {
                  $lvlraise = true;
              }
          }

          // Render the list item or the paragraph
          if ($lvlraise) {
              // Left open on purpose: the nested list goes inside this LI
              $out[] = "<li>{$runs}";

          } else if ($list_fmt) {
              $out[] = "<li>{$runs}</li>";

          } else {
              $tag = $this->determineParaTag($style);

              if ($tag[0] == 'h') {
                  // Collect every image in the heading, not just the first one
                  $has_images = preg_match_all('!<img .+? />!', $runs, $matches);
                  $image_tags = $matches[0];

                  // Remove tags from the heading
                  $heading = trim(strip_tags($runs));

                  // Headings in ALL CAPS get converted to Title Case.
                  if (!preg_match('![a-z]!', $heading)) {
                      $heading = ucwords(strtolower($heading));
                  }

                  // If we actually got any content, output it
                  if ($heading) {
                      $out[] = "<{$tag}>{$heading}</{$tag}>";
                  }

                  // If we found images, inject them in a P tag afterwards
                  if ($has_images) {
                      $out[] = '<p>' . implode('', $image_tags) . '</p>';
                  }

              } else {
                  $out[] = "<{$tag}>{$runs}</{$tag}>";
              }
          }

          // If there was a type change or level drop, pop the UL/OL element
          if ($typechange or $lvldrop) {
              $listtag = array_pop($list_stack);
              $out[] = "</{$listtag}>";
          }
          if ($lvldrop) {
              // $nextstyle is always set here; $lvldrop is only raised inside the look-ahead
              $list_fmt = $nextstyle['number_format'];
              $list_lvl = $nextstyle['number_level'];
              if (count($list_stack)) $out[] = "</li>";
          }
      }

      // Pop any remaining UL or OL elements
      while ($tag = array_pop($list_stack)) {
          $out[] = "</{$tag}>";
      }

      return implode(PHP_EOL, $out) . PHP_EOL;
  }
  229.  
  230.  
  /**
   * Draw a w:tbl element
   *
   * Each cell is rendered as the runs of its paragraphs, joined with BR tags.
   *
   * @param DOMElement $elem The w:tbl element
   * @return string HTML table
   */
  private function drawTable($elem)
  {
      $lines = ['<table class="table--content-standard">'];

      foreach ($elem->getElementsByTagName('tr') as $tr) {
          $lines[] = '<tr>';

          foreach ($tr->getElementsByTagName('tc') as $tc) {
              $parts = [];
              foreach ($tc->getElementsByTagName('p') as $paragraph) {
                  $parts[] = $this->renderBlockRuns($paragraph);
              }

              $lines[] = '<td>' . implode('<br/>', $parts) . '</td>';
          }

          $lines[] = '</tr>';
      }

      $lines[] = '</table>';

      return implode(PHP_EOL, $lines) . PHP_EOL;
  }
  266.  
  267.  
  /**
   * Render all the runs (i.e. w:r elements) for a given block element
   *
   * You would think this would be a simple draw_runs call on the getElementsByTagName,
   * but we would never actually have it _that_ easy...
   *
   * Direct w:r children are rendered as-is, w:hyperlink children are rendered recursively
   * and wrapped in an A tag when their relationship resolves to a target, and the runs
   * nested inside w:smartTag or w:ins children are flattened into the list.
   *
   * @param DOMElement $block
   * @return string XML tags representing the run content, trimmed
   */
  private function renderBlockRuns($block)
  {
      $runs = [];

      foreach ($block->childNodes as $child) {
          if (! $child instanceof DOMElement) continue;

          if ($child->tagName == 'w:r') {
              $runs[] = new DocImportDOCXRun($child);

          } else if ($child->tagName == 'w:hyperlink') {
              // Not every hyperlink has an r:id with a known relationship,
              // so look the target up defensively
              $rel_id = $child->getAttribute('r:id');
              $href = isset($this->relationships[$rel_id]) ? $this->relationships[$rel_id] : null;

              $inner = $this->renderBlockRuns($child);

              $run = new DocImportDOCXRun($child);
              if ($href) {
                  $run->rendered = '<a href="' . Enc::xml($href) . '">' . $inner . '</a>';
              } else {
                  $run->rendered = $inner;
              }
              $runs[] = $run;

          } else if ($child->tagName == 'w:smartTag' or $child->tagName == 'w:ins') {
              foreach ($child->getElementsByTagName('r') as $run) {
                  $runs[] = new DocImportDOCXRun($run);
              }
          }
      }

      return trim($this->drawRuns($runs));
  }
  308.  
  309.  
  /**
   * Output one or more `w:r` elements
   *
   * Tracks bold, italic, subscript and superscript state across runs so that adjacent
   * runs sharing a format end up inside a single tag, then tidies up the result
   * (styled whitespace, repeated BRs, BRs at the edges).
   *
   * @param array $runs List of DocImportDOCXRun objects
   * @return string
   */
  private function drawRuns($runs)
  {
      $out = '';
      $currBold = false;
      $currItalic = false;
      $currSubscript = false;
      $currSuperscript = false;
      $tagStack = [];

      foreach ($runs as $run) {
          // Pre-rendered content (set for hyperlinks) is output verbatim
          if (!empty($run->rendered)) {
              $out .= $run->rendered;
              continue;
          }

          $runElem = $run->elem;
          $newBold = false;
          $newItalic = false;
          $newSubscript = false;
          $newSuperscript = false;
          $symbolDecode = false;

          $rpr = $runElem->getElementsByTagName('rPr');
          if ($rpr->length) {
              foreach ($rpr->item(0)->childNodes as $node) {
                  if (! $node instanceof DOMElement) continue;

                  switch ($node->tagName) {
                      case 'w:rStyle':
                          // The run may reference a style which is not in styles.xml
                          $style_id = $node->getAttribute('w:val');
                          if (isset($this->styles[$style_id])) {
                              $style = $this->styles[$style_id];
                              if (!empty($style['bold'])) $newBold = true;
                              if (!empty($style['italic'])) $newItalic = true;
                          }
                          break;

                      case 'w:b':
                          $newBold = ($node->getAttribute('w:val') !== 'false' and $node->getAttribute('w:val') !== '0');
                          break;

                      case 'w:i':
                          $newItalic = ($node->getAttribute('w:val') !== 'false' and $node->getAttribute('w:val') !== '0');
                          break;

                      case 'w:vertAlign':
                          if ($node->getAttribute('w:val') == 'subscript') {
                              $newSubscript = true;
                          } else if ($node->getAttribute('w:val') == 'superscript') {
                              $newSuperscript = true;
                          }
                          break;

                      case 'w:rFonts':
                          if ($node->getAttribute('w:ascii') == 'Symbol') {
                              $symbolDecode = true;
                          }
                          break;
                  }
              }
          }

          // Determine tags to close
          $needToClose = [];
          if ($currItalic and !$newItalic) $needToClose[] = 'i';
          if ($currBold and !$newBold) $needToClose[] = 'b';
          if ($currSubscript and !$newSubscript) $needToClose[] = 'sub';
          if ($currSuperscript and !$newSuperscript) $needToClose[] = 'sup';

          // Close the whole tag stack, then reopen any which are meant to be open
          if (count($needToClose)) {
              $reopen = [];
              while ($tag = array_pop($tagStack)) {
                  if (!in_array($tag, $needToClose)) $reopen[] = $tag;
                  $out .= '</' . $tag . '>';
              }

              foreach ($reopen as $tag) {
                  $out .= '<' . $tag . '>';
              }

              $tagStack = $reopen;
          }

          // Open new tags
          if (!$currBold and $newBold) { $out .= '<b>'; $tagStack[] = 'b'; }
          if (!$currItalic and $newItalic) { $out .= '<i>'; $tagStack[] = 'i'; }
          if (!$currSubscript and $newSubscript) { $out .= '<sub>'; $tagStack[] = 'sub'; }
          if (!$currSuperscript and $newSuperscript) { $out .= '<sup>'; $tagStack[] = 'sup'; }

          // Update state variables
          $currBold = $newBold;
          $currItalic = $newItalic;
          $currSubscript = $newSubscript;
          $currSuperscript = $newSuperscript;

          // Output the text, br and graphic elements
          foreach ($runElem->childNodes as $node) {
              // Text and comment nodes have no tagName
              if (! $node instanceof DOMElement) continue;

              if ($node->tagName == 'w:t') {
                  // textContent is an empty string for an empty w:t, which has no child node
                  $text = $node->textContent;
                  if ($symbolDecode) {
                      $text = $this->symbolSanitizeString($text);
                  }
                  $out .= Enc::xml($text);

              } else if ($node->tagName == 'w:drawing') {
                  $out .= $this->drawing($node);

              } else if ($node->tagName == 'w:pict') {
                  $out .= $this->pict($node);

              } else if ($node->tagName == 'w:br') {
                  $out .= '<br/>';

              } else if ($node->tagName == 'w:tab') {
                  $out .= "\t";
              }
          }
      }

      // Close any remaining tags
      while ($tag = array_pop($tagStack)) {
          $out .= '</' . $tag . '>';
      }

      // Clean up styled words with unstyled spaces
      $out = preg_replace('!</b>(\s*)<b>!', '$1', $out);
      $out = preg_replace('!</i>(\s*)<i>!', '$1', $out);

      // Clean up unstyled words with styled spaces
      $out = preg_replace('!<b>(\s*)</b>!', '$1', $out);
      $out = preg_replace('!<i>(\s*)</i>!', '$1', $out);

      // Remove multiple BRs in a row
      $out = preg_replace('!<br/>(<br/>)+!', '<br/>', $out);

      // Move BRs outside B and I tags
      $out = preg_replace('!<br/></([bi])>!', '</$1><br/>', $out);
      $out = preg_replace('!<([bi])><br/>!', '<br/><$1>', $out);

      // Remove trailing and leading BRs
      $out = preg_replace('!^<br/>!', '', $out);
      $out = preg_replace('!<br/>$!', '', $out);

      return $out;
  }
  463.  
  464.  
  /**
   * Load the number formats from numbering.xml
   *
   * Abstract numbering definitions are read first, then each w:num is resolved to the
   * abstract definition it references.
   *
   * @return array Keyed by w:numId; each entry may contain 'numFmt' and/or 'styleName'.
   *         Empty if the archive has no word/numbering.xml
   */
  private function loadFormats()
  {
      $out = [];

      if (! $this->zip->statName('word/numbering.xml')) return [];

      $doc = new DOMDocument();
      $doc->loadXML($this->zip->getFromName('word/numbering.xml'));

      // Abstract definitions, keyed by w:abstractNumId
      $abstract = [];
      foreach ($doc->getElementsByTagName('abstractNum') as $elem) {
          $entry = [];

          $fmt = $elem->getElementsByTagName('numFmt');
          if ($fmt->length) {
              $entry['numFmt'] = $fmt->item(0)->getAttribute('w:val');
          }

          $link = $elem->getElementsByTagName('numStyleLink');
          if ($link->length) {
              $entry['styleName'] = $link->item(0)->getAttribute('w:val');
          }

          $abstract[$elem->getAttribute('w:abstractNumId')] = $entry;
      }

      // Concrete numbering instances point at an abstract definition
      foreach ($doc->getElementsByTagName('num') as $elem) {
          $ref = $elem->getElementsByTagName('abstractNumId');

          // Skip instances without a reference rather than dereferencing a missing node
          if (! $ref->length) continue;

          $abstract_id = $ref->item(0)->getAttribute('w:val');
          if (! isset($abstract[$abstract_id])) continue;

          $out[$elem->getAttribute('w:numId')] = $abstract[$abstract_id];
      }

      return $out;
  }
  511.  
  512.  
  /**
   * Load styles from styles.xml
   *
   * @return array Keyed by w:styleId. Each entry has a 'name', and optionally 'numid',
   *         'bold', 'italic', 'based_on_id' and 'based_on_names'.
   *         Empty if the archive has no word/styles.xml
   */
  private function loadStyles()
  {
      $out = [];

      if (! $this->zip->statName('word/styles.xml')) return [];

      $doc = new DOMDocument();
      $doc->loadXML($this->zip->getFromName('word/styles.xml'));

      foreach ($doc->getElementsByTagName('style') as $elem) {
          $id = $elem->getAttribute('w:styleId');

          // A style without a name element gets an empty name instead of a null dereference
          $name = $elem->getElementsByTagName('name');
          $out[$id] = [
              'name' => ($name->length ? $name->item(0)->getAttribute('w:val') : ''),
          ];

          // Numbering style
          $numid = $elem->getElementsByTagName('numId');
          if ($numid->length) {
              $out[$id]['numid'] = $numid->item(0)->getAttribute('w:val');
          }

          // Bold and italic tags; these are on unless w:val is explicitly 'false' or '0'
          foreach (['b' => 'bold', 'i' => 'italic'] as $tag => $key) {
              $toggle = $elem->getElementsByTagName($tag);
              if ($toggle->length == 0) continue;

              $val = $toggle->item(0)->getAttribute('w:val');
              if ($val !== 'false' and $val !== '0') {
                  $out[$id][$key] = true;
              }
          }

          // Base style
          $base = $elem->getElementsByTagName('basedOn');
          if ($base->length) {
              $out[$id]['based_on_id'] = $base->item(0)->getAttribute('w:val');
          }
      }

      // Resolve the names of all ancestor styles once every style is known
      foreach ($out as $index => $row) {
          if (isset($row['based_on_id'])) {
              $out[$index]['based_on_names'] = $this->flattenBasedOnTree($out, $row);
          }
      }

      return $out;
  }
  568.  
  569.  
  /**
   * Walk the chain of styles via the "based on" field to generate a list of names
   *
   * The list is ordered from the most distant ancestor to the direct parent. The walk
   * stops at a style with no parent, at a parent id which is not defined, or when a
   * style id repeats (a cyclic chain would otherwise recurse forever).
   *
   * @param array $styles All styles, keyed by style id
   * @param array $heading The style to find the ancestors of
   * @param array $visited Internal use: style ids already walked, as keys
   * @return array List of names; empty when there are no ancestors
   */
  private function flattenBasedOnTree(&$styles, $heading, $visited = [])
  {
      if (!isset($heading['based_on_id'])) return [];

      $parent_id = $heading['based_on_id'];
      if (!isset($styles[$parent_id])) return [];

      // Guard against styles which are (indirectly) based on themselves
      if (isset($visited[$parent_id])) return [];
      $visited[$parent_id] = true;

      $parent = $styles[$parent_id];

      $chain = $this->flattenBasedOnTree($styles, $parent, $visited);
      $chain[] = $parent['name'];

      return $chain;
  }
  590.  
  591.  
  /**
   * Sometimes a numbering format refers to a style, the style itself contains the actual number format
   * This function dereferences the number formats back again
   *
   * For each number format with a 'styleName', the referenced style's 'numid' is looked
   * up and the 'numFmt' of that numbering definition is copied across. Formats whose
   * chain cannot be fully resolved are left untouched.
   *
   * @return void
   */
  private function numbersFromStyles()
  {
      foreach ($this->number_formats as &$num) {
          if (!isset($num['styleName'])) continue;

          // The linked style may be missing, or may carry no numbering id
          $style_id = $num['styleName'];
          if (empty($this->styles[$style_id]['numid'])) continue;

          $num_id = $this->styles[$style_id]['numid'];
          if (!isset($this->number_formats[$num_id]['numFmt'])) continue;

          $num['numFmt'] = $this->number_formats[$num_id]['numFmt'];
      }

      // Break the reference so later writes to $num cannot alter the last entry
      unset($num);
  }
  615.  
  616.  
  /**
   * Relationships is how the main document.xml links together with various media files etc
   *
   * @return array Relationship targets keyed by relationship Id.
   *         Empty if the archive has no word/_rels/document.xml.rels
   */
  private function loadRelationships()
  {
      $path = 'word/_rels/document.xml.rels';

      // Same existence check as the other loaders, so a missing part is not fed to loadXML()
      if (! $this->zip->statName($path)) return [];

      $doc = new DOMDocument();
      $doc->loadXML($this->zip->getFromName($path));

      $out = [];
      foreach ($doc->getElementsByTagName('Relationship') as $elem) {
          $out[$elem->getAttribute('Id')] = $elem->getAttribute('Target');
      }

      return $out;
  }
  638.  
  639.  
  /**
   * For a given paragraph element, determine the finalised style in use
   *
   * @param DOMElement $elem
   * @return array With keys:
   *         'style' - style id from w:pStyle, or null
   *         'style_name' - name of that style, or null if the style is not known
   *         'based_on' - list of ancestor style names, or null
   *         'number_format' - numbering format (e.g. 'bullet'), or null if not numbered
   *         'number_level' - numbering indent level as an integer, 0 if not numbered
   */
  private function determineStyle($elem) {
      $out = [];
      $out['style'] = null;
      $out['style_name'] = null;
      $out['based_on'] = null;
      $out['number_format'] = null;
      $out['number_level'] = 0;

      // Get style id and name, and apply details from the style
      $pstyle = $elem->getElementsByTagName('pStyle');
      if ($pstyle->length) {
          $out['style'] = $pstyle->item(0)->getAttribute('w:val');

          // The paragraph may reference a style id which is not defined in styles.xml
          if (isset($this->styles[$out['style']])) {
              $style = $this->styles[$out['style']];

              $out['style_name'] = $style['name'];

              if (isset($style['based_on_names'])) {
                  $out['based_on'] = $style['based_on_names'];
              }

              if (isset($style['numid']) and isset($this->number_formats[$style['numid']]['numFmt'])) {
                  $out['number_format'] = $this->number_formats[$style['numid']]['numFmt'];
              }
          }
      }

      // Apply local numbering
      $num = $elem->getElementsByTagName('numPr');
      if ($num->length) {
          $id = $num->item(0)->getElementsByTagName('numId');
          if ($id->length) {
              $numberId = $id->item(0)->getAttribute('w:val');
              if (isset($this->number_formats[$numberId]['numFmt'])) {
                  $out['number_format'] = $this->number_formats[$numberId]['numFmt'];
              }
          }

          $lvl = $num->item(0)->getElementsByTagName('ilvl');
          if ($lvl->length) {
              // Attribute values are strings; callers compare levels numerically
              $out['number_level'] = (int) $lvl->item(0)->getAttribute('w:val');
          }
      }

      // If this is a heading style with numbering, kill the numbering
      $expected_tag = $this->determineParaTag($out);
      if ($expected_tag[0] == 'h') {
          $out['number_format'] = null;
          $out['number_level'] = 0;
      }

      // If this uses the numbering format "none", kill the numbering
      if ($out['number_format'] == 'none') {
          $out['number_format'] = null;
          $out['number_level'] = 0;
      }

      return $out;
  }
  703.  
  704.  
  /**
   * For a given style tag, return a paragraph tag (either 'p' or 'h1', 'h2', etc)
   *
   * The style's own name is checked first, followed by the names of the styles it is
   * based on. The first name starting with "heading N" (case-insensitive, N from 1 to 6)
   * decides the tag.
   *
   * @param array $style Style as returned by determineStyle(); uses 'style_name' and 'based_on'
   * @return string Tag name
   */
  private function determineParaTag($style)
  {
      // The style name is null for paragraphs without a known style
      $names = [isset($style['style_name']) ? $style['style_name'] : ''];

      if (isset($style['based_on']) and is_array($style['based_on'])) {
          foreach ($style['based_on'] as $name) {
              $names[] = $name;
          }
      }

      foreach ($names as $name) {
          if (preg_match('/^heading ([1-6])/', strtolower((string) $name), $matches)) {
              return 'h' . $matches[1];
          }
      }

      return 'p';
  }
  738.  
  739.  
  /**
   * Pick the list element which matches a style's numbering format
   *
   * @param array $style Style as returned by determineStyle(); only 'number_format' is used
   * @return string Tag name: 'ul' for the 'bullet' format, 'ol' for anything else
   */
  private function determineListTag($style)
  {
      return ($style['number_format'] == 'bullet') ? 'ul' : 'ol';
  }
  751.  
  752.  
  /**
   * Render a w:drawing object, i.e. an image
   *
   * The image data is stored in $this->res under its file name, for output by load().
   * Images which are not JPEG, GIF or PNG produce an IMG tag with an error attribute.
   *
   * @param DOMElement $elem
   * @return string HTML img tag, or an empty string if the image cannot be resolved
   */
  private function drawing($elem)
  {
      $graphic = $elem->getElementsByTagName('graphic');
      if (! $graphic->length) return '';
      $graphic = $graphic->item(0);

      $blip = $graphic->getElementsByTagName('blip');
      if (! $blip->length) return '';
      $id = $blip->item(0)->getAttribute('r:embed');

      // The blip may have no r:embed, or point at a relationship we do not know
      if (!isset($this->relationships[$id])) return '';
      $target = $this->relationships[$id];

      // Check resource exists
      if (! $this->zip->statName('word/' . $target)) return '';

      // Get image size props. Only an ext with a cx attribute carries the size; other
      // ext elements without one can appear earlier in the graphic
      $sizeX = 0;
      $sizeY = 0;
      foreach ($graphic->getElementsByTagName('ext') as $ext) {
          if (! $ext->hasAttribute('cx')) continue;

          $sizeX = $this->EMUtoPX((float) $ext->getAttribute('cx'));
          $sizeY = $this->EMUtoPX((float) $ext->getAttribute('cy'));
          break;
      }

      // Check ext
      $resname = basename($target);
      $fileext = strtolower(File::getExt(trim($resname)));
      if (!in_array($fileext, ['jpg', 'jpeg', 'gif', 'png'])) {
          return '<img error="unsupported-type" res="' . Enc::xml($resname) . '" width="' . round($sizeX) . '" height="' . round($sizeY) . '" />';
      }

      // Load resource
      if (empty($this->res[$resname])) {
          $this->res[$resname] = $this->zip->getFromName('word/' . $target);
      }

      // The file name comes from the uploaded document, so escape it for the attribute
      return '<img rel="' . Enc::xml($resname) . '" width="' . round($sizeX, 1) . '" height="' . round($sizeY, 1) . '" />';
  }
  792.  
  793.  
  /**
   * Render a w:pict object, i.e. an image
   *
   * The image data is stored in $this->res under its file name, for output by load().
   * Images which are not JPEG, GIF or PNG produce an IMG tag with an error attribute.
   *
   * @param DOMElement $elem
   * @return string HTML img tag, or an empty string if the image cannot be resolved
   */
  private function pict($elem)
  {
      $shape = $elem->getElementsByTagName('shape');
      if (! $shape->length) return '';
      $shape = $shape->item(0);

      $imagedata = $shape->getElementsByTagName('imagedata');
      if (! $imagedata->length) return '';
      $id = $imagedata->item(0)->getAttribute('r:id');

      // The image may point at a relationship we do not know
      if (!isset($this->relationships[$id])) return '';
      $target = $this->relationships[$id];

      // Check resource exists
      if (! $this->zip->statName('word/' . $target)) return '';

      // Get image size props; the first run of digits in the CSS width/height is used.
      // Default to zero when the shape style has no usable width or height
      $css = $this->parseCss($shape->getAttribute('style'));
      $sizeX = 0;
      $sizeY = 0;
      if (isset($css['width']) and preg_match('/[0-9]+/', $css['width'], $matches)) $sizeX = $matches[0];
      if (isset($css['height']) and preg_match('/[0-9]+/', $css['height'], $matches)) $sizeY = $matches[0];

      // Check ext
      $resname = basename($target);
      $fileext = strtolower(File::getExt(trim($resname)));
      if (!in_array($fileext, ['jpg', 'jpeg', 'gif', 'png'])) {
          return '<img error="unsupported-type" res="' . Enc::xml($resname) . '" width="' . round($sizeX) . '" height="' . round($sizeY) . '" />';
      }

      // Load resource
      if (empty($this->res[$resname])) {
          $this->res[$resname] = $this->zip->getFromName('word/' . $target);
      }

      // The file name comes from the uploaded document, so escape it for the attribute
      return '<img rel="' . Enc::xml($resname) . '" width="' . round($sizeX, 1) . '" height="' . round($sizeY, 1) . '" />';
  }
  834.  
  835.  
  /**
   * Convert 'Symbol' font Private-Use-Area characters into real characters
   *
   * Every character in the range U+F020 to U+F0FE is passed to
   * symbolUnicodeToUtf8Entity() and replaced by its result.
   *
   * @param string $string UTF-8 text
   * @return string
   */
  public function symbolSanitizeString($string)
  {
      return preg_replace_callback(
          '/([\x{f020}-\x{f0fe}]{1})/u',
          [$this, 'symbolUnicodeToUtf8Entity'],
          $string
      );
  }
  849.  
  850.  
  /**
   * Regular expression callback for Symbol font conversion
   *
   * @param array $wchar Regex matches; index 1 holds the UTF-8 bytes of the matched character
   * @return string The replacement character as UTF-8, or '?' if there is no mapping
   */
  public function symbolUnicodeToUtf8Entity($wchar)
  {
      // The map is keyed by the UTF-8 byte sequence read as one integer
      $conv = hexdec(bin2hex($wchar[1]));

      // Some characters in the matched range have no entry in the map
      if (empty(self::$symbol_font_map[$conv])) return '?';

      // Decode a numeric entity to get the UTF-8 character for the code point
      $entity = '&#' . intval(self::$symbol_font_map[$conv]) . ';';
      return html_entity_decode($entity, ENT_QUOTES | ENT_HTML401, 'UTF-8');
  }
  863.  
  864.  
  /**
   * Parse an inline CSS declaration list, e.g. "width:100pt;height:50pt"
   *
   * Declarations without a colon, or with an empty property name or value, are skipped.
   *
   * @param string $css
   * @return array Trimmed values keyed by trimmed property name
   */
  private function parseCss($css)
  {
      $out = [];

      foreach (explode(';', $css) as $rule) {
          // A trailing semicolon or stray text gives a fragment with no colon
          $parts = explode(':', $rule, 2);
          if (count($parts) != 2) continue;

          $key = trim($parts[0]);
          $val = trim($parts[1]);

          // Compare against the empty string so that a value of "0" is kept
          if ($key === '' or $val === '') continue;

          $out[$key] = $val;
      }

      return $out;
  }
  886.  
  887.  
  /**
   * EMUs (English Metric Units) to pixels
   *
   * The value is divided by 914400 (EMUs per inch) and multiplied by the DPI.
   *
   * @param int|float|string $emu Numeric value; anything non-numeric, such as the
   *        empty string of a missing attribute, is treated as zero
   * @param int $dpi
   * @return float Pixel value
   */
  private function EMUtoPX($emu, $dpi = 72)
  {
      return (float) $emu / 914400 * $dpi;
  }
  899.  
  900.  
  /**
   * Mapping from Symbol font Private Use Area characters to regular Unicode characters
   *
   * Key - The UTF-8 encoded bytes of the PUA character, read as a single integer
   * Value - The Unicode code point of the replacement character
   **/
  907. static $symbol_font_map = [
  908. 15696032 => 32,
  909. 15696033 => 33,
  910. 15696034 => 8704,
  911. 15696035 => 35,
  912. 15696036 => 8707,
  913. 15696037 => 37,
  914. 15696038 => 38,
  915. 15696039 => 8715,
  916. 15696040 => 40,
  917. 15696041 => 41,
  918. 15696042 => 8727,
  919. 15696043 => 43,
  920. 15696044 => 44,
  921. 15696045 => 8722,
  922. 15696046 => 46,
  923. 15696047 => 47,
  924. 15696048 => 48,
  925. 15696049 => 49,
  926. 15696050 => 50,
  927. 15696051 => 51,
  928. 15696052 => 52,
  929. 15696053 => 53,
  930. 15696054 => 54,
  931. 15696055 => 55,
  932. 15696056 => 56,
  933. 15696057 => 57,
  934. 15696058 => 58,
  935. 15696059 => 59,
  936. 15696060 => 60,
  937. 15696061 => 61,
  938. 15696062 => 62,
  939. 15696063 => 63,
  940. 15696256 => 8773,
  941. 15696257 => 913,
  942. 15696258 => 914,
  943. 15696259 => 935,
  944. 15696260 => 916,
  945. 15696261 => 917,
  946. 15696262 => 934,
  947. 15696263 => 915,
  948. 15696264 => 919,
  949. 15696265 => 921,
  950. 15696266 => 977,
  951. 15696267 => 922,
  952. 15696268 => 923,
  953. 15696269 => 924,
  954. 15696270 => 925,
  955. 15696271 => 927,
  956. 15696272 => 928,
  957. 15696273 => 920,
  958. 15696274 => 929,
  959. 15696275 => 931,
  960. 15696276 => 932,
  961. 15696277 => 933,
  962. 15696278 => 962,
  963. 15696279 => 937,
  964. 15696280 => 926,
  965. 15696281 => 936,
  966. 15696282 => 918,
  967. 15696283 => 91,
  968. 15696284 => 8756,
  969. 15696285 => 93,
  970. 15696286 => 8869,
  971. 15696287 => 95,
  972. 15696288 => 63717,
  973. 15696289 => 945,
  974. 15696290 => 946,
  975. 15696291 => 967,
  976. 15696292 => 948,
  977. 15696293 => 949,
  978. 15696294 => 966,
  979. 15696295 => 947,
  980. 15696296 => 951,
  981. 15696297 => 953,
  982. 15696298 => 981,
  983. 15696299 => 954,
  984. 15696300 => 955,
  985. 15696301 => 956,
  986. 15696302 => 957,
  987. 15696303 => 959,
  988. 15696304 => 960,
  989. 15696305 => 952,
  990. 15696306 => 961,
  991. 15696307 => 963,
  992. 15696308 => 964,
  993. 15696309 => 965,
  994. 15696310 => 982,
  995. 15696311 => 969,
  996. 15696312 => 958,
  997. 15696313 => 968,
  998. 15696314 => 950,
  999. 15696315 => 123,
  1000. 15696316 => 124,
  1001. 15696317 => 125,
  1002. 15696318 => 8764,
  1003. 15696544 => 8364,
  1004. 15696545 => 978,
  1005. 15696546 => 8242,
  1006. 15696547 => 8804,
  1007. 15696548 => 8260,
  1008. 15696549 => 8734,
  1009. 15696550 => 402,
  1010. 15696551 => 9827,
  1011. 15696552 => 9830,
  1012. 15696553 => 9829,
  1013. 15696554 => 9824,
  1014. 15696555 => 8596,
  1015. 15696556 => 8592,
  1016. 15696557 => 8593,
  1017. 15696558 => 8594,
  1018. 15696559 => 8595,
  1019. 15696560 => 176,
  1020. 15696561 => 177,
  1021. 15696562 => 8243,
  1022. 15696563 => 8805,
  1023. 15696564 => 215,
  1024. 15696565 => 8733,
  1025. 15696566 => 8706,
  1026. 15696567 => 8226,
  1027. 15696568 => 247,
  1028. 15696569 => 8800,
  1029. 15696570 => 8801,
  1030. 15696571 => 8776,
  1031. 15696572 => 8230,
  1032. 15696573 => 63718,
  1033. 15696574 => 63719,
  1034. 15696575 => 8629,
  1035. 15696768 => 8501,
  1036. 15696769 => 8465,
  1037. 15696770 => 8476,
  1038. 15696771 => 8472,
  1039. 15696772 => 8855,
  1040. 15696773 => 8853,
  1041. 15696774 => 8709,
  1042. 15696775 => 8745,
  1043. 15696776 => 8746,
  1044. 15696777 => 8835,
  1045. 15696778 => 8839,
  1046. 15696779 => 8836,
  1047. 15696780 => 8834,
  1048. 15696781 => 8838,
  1049. 15696782 => 8712,
  1050. 15696783 => 8713,
  1051. 15696784 => 8736,
  1052. 15696785 => 8711,
  1053. 15696786 => 63194,
  1054. 15696787 => 63193,
  1055. 15696788 => 63195,
  1056. 15696789 => 8719,
  1057. 15696790 => 8730,
  1058. 15696791 => 8901,
  1059. 15696792 => 172,
  1060. 15696793 => 8743,
  1061. 15696794 => 8744,
  1062. 15696795 => 8660,
  1063. 15696796 => 8656,
  1064. 15696797 => 8657,
  1065. 15696798 => 8658,
  1066. 15696799 => 8659,
  1067. 15696800 => 9674,
  1068. 15696801 => 9001,
  1069. 15696802 => 63720,
  1070. 15696803 => 63721,
  1071. 15696804 => 63722,
  1072. 15696805 => 8721,
  1073. 15696806 => 63723,
  1074. 15696807 => 63724,
  1075. 15696808 => 63725,
  1076. 15696809 => 63726,
  1077. 15696810 => 63727,
  1078. 15696811 => 63728,
  1079. 15696812 => 63729,
  1080. 15696813 => 63730,
  1081. 15696814 => 63731,
  1082. 15696815 => 63732,
  1083. 15696817 => 9002,
  1084. 15696818 => 8747,
  1085. 15696819 => 8992,
  1086. 15696820 => 63733,
  1087. 15696821 => 8993,
  1088. 15696822 => 63734,
  1089. 15696823 => 63735,
  1090. 15696824 => 63736,
  1091. 15696825 => 63737,
  1092. 15696826 => 63738,
  1093. 15696827 => 63739,
  1094. 15696828 => 63740,
  1095. 15696829 => 63741,
  1096. 15696830 => 63742,
  1097. ];
  1098. }
  1099.  
  1100.  
  /**
   * Holder for one piece of inline content queued up for DocImportDOCX::drawRuns()
   */
  class DocImportDOCXRun
  {
      /** @var DOMElement The wrapped element: a w:r, or the w:hyperlink itself for links */
      public $elem;

      /** @var mixed Never assigned within this file */
      public $hyperlink;

      /** @var string|null Pre-rendered markup; when non-empty it is output instead of $elem */
      public $rendered;


      /**
       * Wrap an element; the other properties start out as null
       *
       * @param DOMElement $elem
       */
      public function __construct($elem)
      {
          $this->elem = $elem;
      }
  }
  1118.