Code Coverage

                        | Lines         | Functions and Methods | CRAP | Classes and Traits
------------------------|---------------|-----------------------|------|-------------------
Total                   | 0.00% (0/114) | 0.00% (0/6)           |      | 0.00% (0/1)
VuFindSitemap           | 0.00% (0/114) | 0.00% (0/6)           | 812  | 0.00% (0/1)
  getApertureFields     | 0.00% (0/28)  | 0.00% (0/1)           | 56   |
  getTikaFields         | 0.00% (0/22)  | 0.00% (0/1)           | 30   |
  getHtmlFields         | 0.00% (0/29)  | 0.00% (0/1)           | 30   |
  arrayToSolrXml        | 0.00% (0/8)   | 0.00% (0/1)           | 30   |
  getDocument           | 0.00% (0/1)   | 0.00% (0/1)           | 2    |
  getDocumentFieldArray | 0.00% (0/26)  | 0.00% (0/1)           | 30   |
<?php

/**
 * XSLT importer support methods for sitemaps.
 *
 * PHP version 8
 *
 * Copyright (c) Demian Katz 2010.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * @category VuFind
 * @package  Import_Tools
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/indexing Wiki
 */

namespace VuFind\XSLT\Import;

use function chr;
use function is_array;

/**
 * XSLT support class -- all methods of this class must be public and static;
 * they will be automatically made available to your XSL stylesheet for use
 * with the php:function() function.
 *
 * @category VuFind
 * @package  Import_Tools
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/indexing Wiki
 */
class VuFindSitemap extends VuFind
{
    /**
     * Load metadata about an HTML document using Aperture.
     *
     * @param string $htmlFile File on disk containing HTML.
     *
     * @return array
     */
    protected static function getApertureFields($htmlFile)
    {
        $xmlFile = tempnam('/tmp', 'apt');
        $cmd = static::getApertureCommand($htmlFile, $xmlFile, 'filecrawler');
        exec($cmd);

        // If we failed to process the file, give up now:
        if (!file_exists($xmlFile)) {
            throw new \Exception('Aperture failed.');
        }

        // Extract and decode the full text from the XML:
        $xml = str_replace(chr(0), ' ', file_get_contents($xmlFile));
        @unlink($xmlFile);
        preg_match('/<plainTextContent[^>]*>([^<]*)</ms', $xml, $matches);
        $final = isset($matches[1]) ?
            trim(html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8')) : '';

        // Extract the title from the XML:
        preg_match('/<title[^>]*>([^<]*)</ms', $xml, $matches);
        $title = isset($matches[1]) ?
            trim(html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8')) : '';

        // Extract the keywords from the XML:
        preg_match_all('/<keyword[^>]*>([^<]*)</ms', $xml, $matches);
        $keywords = [];
        if (isset($matches[1])) {
            foreach ($matches[1] as $current) {
                $keywords[]
                    = trim(html_entity_decode($current, ENT_QUOTES, 'UTF-8'));
            }
        }

        // Extract the description from the XML:
        preg_match('/<description[^>]*>([^<]*)</ms', $xml, $matches);
        $description = isset($matches[1])
            ? trim(html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8')) : '';

        // Send back the extracted fields:
        return [
            'title' => $title,
            'keywords' => $keywords,
            'description' => $description,
            'fulltext' => $final,
        ];
    }
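
    // Illustrative sketch (not part of the original class): assuming Aperture's
    // filecrawler produced <plainTextContent>, <title>, <keyword> and
    // <description> elements for the harvested page, getApertureFields() would
    // return an array shaped like this (all values hypothetical):
    //
    //     [
    //         'title'       => 'About the Library',
    //         'keywords'    => ['hours', 'directions'],
    //         'description' => 'General information about the library.',
    //         'fulltext'    => 'About the Library Our hours are ...',
    //     ]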

    /**
     * Load metadata about an HTML document using Tika.
     *
     * @param string $htmlFile File on disk containing HTML.
     *
     * @return array
     */
    protected static function getTikaFields($htmlFile)
    {
        // Extract and decode the full text from the XML:
        $xml = static::harvestWithTika($htmlFile, '--xml');

        // Extract the title from the XML:
        preg_match('/<title[^>]*>([^<]*)</ms', $xml, $matches);
        $title = isset($matches[1]) ?
            html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';

        // Extract the keywords from the XML:
        preg_match_all(
            '/<meta name="keywords" content="([^"]*)"/ms',
            $xml,
            $matches
        );
        $keywords = [];
        if (isset($matches[1])) {
            foreach ($matches[1] as $current) {
                $keywords[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
            }
        }

        // Extract the description from the XML:
        preg_match('/<meta name="description" content="([^"]*)"/ms', $xml, $matches);
        $description = isset($matches[1])
            ? html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';

        // Send back the extracted fields:
        return [
            'title' => $title,
            'keywords' => $keywords,
            'description' => $description,
            'fulltext' => $title . ' ' . static::harvestWithTika($htmlFile),
        ];
    }
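
    // Illustrative sketch (not part of the original class): the regular
    // expressions above assume Tika's --xml output contains markup along these
    // (hypothetical) lines:
    //
    //     <title>About the Library</title>
    //     <meta name="keywords" content="hours, directions"/>
    //     <meta name="description" content="General information."/>
    //
    // Matched values are entity-decoded, and the full text itself is harvested
    // by a second Tika call without the --xml flag.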

    /**
     * Extract key metadata from HTML.
     *
     * NOTE: This method uses some non-standard meta tags; it is intended as an
     * example that can be overridden/extended to support local practices.
     *
     * @param string $html HTML content.
     *
     * @return array
     */
    protected static function getHtmlFields($html)
    {
        // Extract the subjects from the HTML:
        preg_match_all(
            '/<meta name="subject" content="([^"]*)"/ms',
            $html,
            $matches
        );
        $subjects = [];
        if (isset($matches[1])) {
            foreach ($matches[1] as $current) {
                $subjects[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
            }
        }

        // Extract the link types from the HTML:
        preg_match_all(
            '/<meta name="category" content="([^"]*)"/ms',
            $html,
            $matches
        );
        $categories = [];
        if (isset($matches[1])) {
            foreach ($matches[1] as $current) {
                $categories[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
            }
        }

        // Extract the use count from the HTML:
        preg_match_all(
            '/<meta name="useCount" content="([^"]*)"/ms',
            $html,
            $matches
        );
        $useCount = $matches[1][0] ?? 1;

        return [
            'category' => $categories,
            'subject' => $subjects,
            'use_count' => $useCount,
        ];
    }
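
    // Illustrative sketch (not part of the original class): given source HTML
    // containing the non-standard meta tags mentioned in the docblock, e.g.
    //
    //     <meta name="subject" content="Interlibrary Loan">
    //     <meta name="category" content="Services">
    //     <meta name="useCount" content="42">
    //
    // getHtmlFields() would return (values hypothetical):
    //
    //     ['category' => ['Services'], 'subject' => ['Interlibrary Loan'], 'use_count' => '42']
    //
    // When no useCount tag is present, use_count defaults to 1.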

    /**
     * Convert an associative array of fields into a Solr document.
     *
     * @param array $fields Field data
     *
     * @return string
     */
    public static function arrayToSolrXml($fields)
    {
        $xml = '';
        foreach ($fields as $key => $value) {
            $value = is_array($value) ? $value : [$value];
            foreach ($value as $current) {
                if (!empty($current)) {
                    $xml .= '<field name="' . $key . '">'
                        . htmlspecialchars($current) . '</field>';
                }
            }
        }
        return $xml;
    }
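
    // Illustrative sketch (not part of the original class): arrayToSolrXml()
    // turns scalar and array values into repeated Solr <field> elements and
    // skips empty values. For example (hypothetical input):
    //
    //     arrayToSolrXml(['id' => 'abc', 'subject' => ['Maps', 'History'], 'note' => ''])
    //
    // yields:
    //
    //     <field name="id">abc</field><field name="subject">Maps</field><field name="subject">History</field>
    //
    // The result is a bare sequence of <field> elements; the calling stylesheet
    // is typically expected to wrap it in a Solr <doc> element.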

    /**
     * Harvest the contents of a document file (PDF, Word, etc.) using Aperture.
     * This method will only work if Aperture is properly configured in the
     * web/conf/fulltext.ini file. Without proper configuration, this will
     * simply return an empty string.
     *
     * @param string $url URL of file to retrieve.
     *
     * @return string text contents of file.
     */
    public static function getDocument($url)
    {
        // Turn the array into XML:
        return static::arrayToSolrXml(static::getDocumentFieldArray($url));
    }
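
    // Illustrative sketch (not part of the original class): getDocument() is a
    // public entry point suitable for calling from a stylesheet via
    // php:function(); it fetches the URL, runs the configured parser, and
    // returns ready-made Solr field XML (URL and output below are hypothetical):
    //
    //     VuFindSitemap::getDocument('https://library.example.edu/about.html');
    //     // => '<field name="title">About the Library</field>...<field name="id">...</field>'
    //
    // If the parser setting resolves to 'None', getDocumentFieldArray() returns
    // an empty array and the result is an empty string.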

    /**
     * Support method for getDocument() -- retrieve associative array of field data.
     *
     * @param string $url URL of file to retrieve.
     *
     * @return array
     */
    protected static function getDocumentFieldArray($url)
    {
        $parser = static::getParser();
        if ($parser == 'None') {
            return [];
        }

        // Grab the HTML and write it to disk:
        $htmlFile = tempnam('/tmp', 'htm');
        $html = file_get_contents($url);
        file_put_contents($htmlFile, $html);

        // Use the appropriate full text parser:
        switch ($parser) {
            case 'Aperture':
                $fields = static::getApertureFields($htmlFile);
                break;
            case 'Tika':
                $fields = static::getTikaFields($htmlFile);
                break;
            default:
                throw new \Exception('Unexpected parser: ' . $parser);
        }

        // Clean up HTML file:
        @unlink($htmlFile);

        // Add data loaded directly from HTML:
        $fields += static::getHtmlFields($html);

        // Clean up/normalize full text:
        $fields['fulltext'] = trim(
            preg_replace(
                '/\s+/',
                ' ',
                static::stripBadChars($fields['fulltext'])
            )
        );

        // Use a hash of the URL for the ID:
        $fields['id'] = md5($url);

        // Add other key values:
        $fields['url'] = $url;
        $fields['last_indexed'] = date('Y-m-d\TH:i:s\Z');

        return $fields;
    }
}
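
The class docblock explains that every public static method here is automatically exposed to XSL stylesheets through php:function(). The short sketch below, separate from the file above, shows how that exposure works using PHP's stock DOM/XSL extension; the stylesheet and input file names (sitemap.xsl, crawl.xml) are hypothetical placeholders, and VuFind's own XSLT importer normally performs this registration for you during an import run.

<?php

// Minimal sketch, assuming ext-xsl is enabled; sitemap.xsl and crawl.xml are
// hypothetical names standing in for a real import stylesheet and the XML
// document being imported.
$xsl = new DOMDocument();
$xsl->load('sitemap.xsl');

$xml = new DOMDocument();
$xml->load('crawl.xml');

$proc = new XSLTProcessor();
$proc->importStylesheet($xsl);

// Expose public static PHP methods (e.g. VuFindSitemap::getDocument) to the
// stylesheet via php:function():
$proc->registerPHPFunctions();

echo $proc->transformToXml($xml);

// Inside the stylesheet (with xmlns:php="http://php.net/xsl" declared), a
// template could then call, for example:
//
//     <xsl:value-of disable-output-escaping="yes"
//         select="php:function('VuFind\XSLT\Import\VuFindSitemap::getDocument', string(url))"/>

disable-output-escaping matters here because getDocument() returns raw <field> markup as a string, which would otherwise be entity-encoded in the transformation output.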