Context Navigation

source: branches/version-2_5-dev/data/module/fpdf/pdf_parser.php @ 19716

Visit:

Revision 19716, 23.1 KB checked in by Seasoft, 16 years ago (diff)

#403(インクルードしているライブラリ群をバージョンアップする)

FPDF関連 (PHP4について、動作環境無く未確認)

Line
1	<?php
2	//
3	// FPDI - Version 1.4
4	//
5	// Copyright 2004-2010 Setasign - Jan Slabon
6	//
7	// Licensed under the Apache License, Version 2.0 (the "License");
8	// you may not use this file except in compliance with the License.
9	// You may obtain a copy of the License at
10	//
11	// http://www.apache.org/licenses/LICENSE-2.0
12	//
13	// Unless required by applicable law or agreed to in writing, software
14	// distributed under the License is distributed on an "AS IS" BASIS,
15	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16	// See the License for the specific language governing permissions and
17	// limitations under the License.
18	//
19
// Type identifiers stored as element 0 of every value array the parser returns.
// A constant that already exists is left untouched.
defined('PDF_TYPE_NULL')       || define('PDF_TYPE_NULL', 0);
defined('PDF_TYPE_NUMERIC')    || define('PDF_TYPE_NUMERIC', 1);
defined('PDF_TYPE_TOKEN')      || define('PDF_TYPE_TOKEN', 2);
defined('PDF_TYPE_HEX')        || define('PDF_TYPE_HEX', 3);
defined('PDF_TYPE_STRING')     || define('PDF_TYPE_STRING', 4);
defined('PDF_TYPE_DICTIONARY') || define('PDF_TYPE_DICTIONARY', 5);
defined('PDF_TYPE_ARRAY')      || define('PDF_TYPE_ARRAY', 6);
defined('PDF_TYPE_OBJDEC')     || define('PDF_TYPE_OBJDEC', 7);
defined('PDF_TYPE_OBJREF')     || define('PDF_TYPE_OBJREF', 8);
defined('PDF_TYPE_OBJECT')     || define('PDF_TYPE_OBJECT', 9);
defined('PDF_TYPE_STREAM')     || define('PDF_TYPE_STREAM', 10);
defined('PDF_TYPE_BOOLEAN')    || define('PDF_TYPE_BOOLEAN', 11);
defined('PDF_TYPE_REAL')       || define('PDF_TYPE_REAL', 12);
46
47	require_once('pdf_context.php');
48
49	if (!class_exists('pdf_parser', false)) {
50
51	class pdf_parser {
52
/**
 * Name of the PDF file handed to the constructor
 * @var string
 */
var $filename;

/**
 * File handle returned by fopen() for $filename
 * @var resource
 */
var $f;

/**
 * Parsing context created on top of $f
 * @var object pdf_context-Instance
 */
var $c;

/**
 * Cross-reference data collected by pdf_read_xref()
 * @var array
 */
var $xref;

/**
 * Resolved /Root object of the document
 * @var array
 */
var $root;

/**
 * PDF version read from the file header by getPDFVersion()
 * @var string
 */
var $pdfVersion;

/**
 * For reading encrypted documents and xref/objectstreams are in use
 * (not referenced anywhere in this file)
 *
 * @var boolean
 */
var $readPlain = true;

/**
 * Object currently being assembled by pdf_resolve_object().
 *
 * pdf_read_value() looks up the /Length entry in it when it reaches a
 * "stream" keyword. Declared explicitly so that assigning it does not
 * create a dynamic property.
 *
 * @var array
 */
var $actual_obj;
95
/**
 * Constructor
 *
 * Opens the file, reads its PDF version, the cross-reference data and the
 * /Root object. Stops through error() when the file cannot be opened or is
 * encrypted.
 *
 * @param string $filename Source-Filename
 */
function __construct($filename) {
    $this->filename = $filename;

    // Warnings are suppressed because a failure is reported through error()
    $this->f = @fopen($this->filename, 'rb');

    if (!$this->f) {
        $this->error(sprintf('Cannot open %s !', $filename));
    }

    $this->getPDFVersion();

    $this->c = new pdf_context($this->f);

    // Read xref-Data
    $this->xref = array();
    $this->pdf_read_xref($this->xref, $this->pdf_find_xref());

    // Check for Encryption
    $this->getEncryption();

    // Read root
    $this->pdf_read_root();
}

/**
 * Class-named constructor kept for backward compatibility
 *
 * PHP 8 no longer treats a method named after its class as the
 * constructor, so the real work lives in __construct(). This method stays
 * so that code calling pdf_parser() explicitly (for example
 * parent::pdf_parser($filename) in a subclass) keeps working.
 *
 * @param string $filename Source-Filename
 */
function pdf_parser($filename) {
    // self:: instead of $this-> so an overriding __construct() in a
    // subclass cannot call back into this method and recurse
    self::__construct($filename);
}
123
/**
 * Closes the file handle, if one is still open, and drops the property
 */
function closeFile() {
    if (!isset($this->f) || !is_resource($this->f)) {
        return;
    }

    fclose($this->f);
    unset($this->f);
}
133
/**
 * Outputs the error message and terminates the script
 *
 * @param string $msg Error-Message
 */
function error($msg) {
    $output = '<b>PDF-Parser Error:</b> '.$msg;
    exit($output);
}
142
/**
 * Raises an error when the trailer dictionary has an /Encrypt entry
 */
function getEncryption() {
    $isEncrypted = isset($this->xref['trailer'][1]['/Encrypt']);

    if ($isEncrypted) {
        $this->error('File is encrypted!');
    }
}
151
/**
 * Returns the /Root entry of the trailer dictionary
 *
 * Raises an error when the entry is not an indirect reference.
 *
 * @return array
 */
function pdf_find_root() {
    $rootEntry = $this->xref['trailer'][1]['/Root'];

    if (PDF_TYPE_OBJREF != $rootEntry[0]) {
        $this->error('Wrong Type of Root-Element! Must be an indirect reference');
    }

    return $rootEntry;
}
164
/**
 * Resolves the /Root reference and stores the object in $this->root
 */
function pdf_read_root() {
    $rootReference = $this->pdf_find_root();
    $this->root = $this->pdf_resolve_object($this->c, $rootReference);
}
172
/**
 * Reads the PDF version from the first 16 bytes of the file
 *
 * The first "digit.digit" sequence found there is stored in
 * $this->pdfVersion. When nothing matches, the property keeps its
 * previous value.
 *
 * @return string the value of $this->pdfVersion
 */
function getPDFVersion() {
    fseek($this->f, 0);
    $header = fread($this->f, 16);

    preg_match('/\d\.\d/', $header, $found);
    if (isset($found[0])) {
        $this->pdfVersion = $found[0];
    }

    return $this->pdfVersion;
}
185
/**
 * Find the xref-Table
 *
 * Looks at the last 1500 bytes of the file (or the whole file when it is
 * shorter), locates the last "startxref" keyword and returns the number
 * that follows it. Raises an error when the keyword or the number is
 * missing.
 *
 * @return integer byte offset of the xref table
 */
function pdf_find_xref() {
    $toRead = 1500;

    // Seeking back further than the file is long fails; start at the top then
    if (fseek($this->f, -$toRead, SEEK_END) === -1) {
        fseek($this->f, 0);
    }
    $data = fread($this->f, $toRead);

    // The last occurrence is the one that belongs to the newest trailer
    $keyword = 'startxref';
    $keywordPos = is_string($data) ? strrpos($data, $keyword) : false;
    if ($keywordPos === false) {
        $this->error('Unable to find pointer to xref table');
    }

    $data = substr($data, $keywordPos + strlen($keyword));

    // Optional whitespace, the offset, then anything (usually "%%EOF")
    if (!preg_match('/\s*(\d+).*$/s', $data, $matches)) {
        $this->error('Unable to find pointer to xref table');
    }

    return (int) $matches[1];
}
207
/**
 * Read xref-table
 *
 * Parses the cross-reference section at $offset and merges it into
 * $result, then follows the trailer's /Prev entry recursively. Keys
 * written to $result:
 *  - 'xref_location': position of the first "xref" keyword that was read
 *  - 'max_object':    highest "start + count" seen in a subsection header
 *  - 'xref':          [object number][generation] => byte offset, or null
 *                     for entries not marked "n"
 *  - 'trailer':       trailer dictionary of the first section that was read
 * Entries already present are never overwritten, so the section read first
 * takes precedence over the ones reached through /Prev.
 *
 * @param array $result Array of xref-table
 * @param integer $offset of xref-table
 * @return boolean always true (errors end the script through error())
 */
function pdf_read_xref(&$result, $offset) {
    // Start up to 20 bytes early to tolerate documents whose offset is slightly off
    $o_pos = $offset - min(20, $offset);
    fseek($this->f, $o_pos);

    $data = fread($this->f, 100);

    $xrefPos = strrpos($data, 'xref');

    if ($xrefPos === false) {
        // No xref keyword: check whether an object declaration sits at the
        // offset instead, to give a more helpful message
        fseek($this->f, $offset);
        $c = new pdf_context($this->f);
        $xrefStreamObjDec = $this->pdf_read_value($c);

        if (is_array($xrefStreamObjDec) && isset($xrefStreamObjDec[0]) && $xrefStreamObjDec[0] == PDF_TYPE_OBJDEC) {
            $this->error(sprintf('This document (%s) probably uses a compression technique which is not supported by the free parser shipped with FPDI.', $this->filename));
        } else {
            $this->error('Unable to find xref table.');
        }
    }

    if (!isset($result['xref_location'])) {
        $result['xref_location'] = $o_pos + $xrefPos;
        $result['max_object'] = 0;
    }

    $cycles = -1;
    $bytesPerCycle = 100;

    // Position the handle directly after the "xref" keyword
    fseek($this->f, $o_pos = $o_pos + $xrefPos + 4);
    $data = fread($this->f, $bytesPerCycle);

    // Read chunk by chunk until "trailer" shows up. The search restarts one
    // chunk behind the newest data so a keyword split across two chunks is
    // still found.
    while (($trailerPos = strpos($data, 'trailer', max($bytesPerCycle * $cycles++, 0))) === false && !feof($this->f)) {
        $data .= fread($this->f, $bytesPerCycle);
    }

    if ($trailerPos === false) {
        $this->error('Trailer keyword not found after xref table');
    }

    $data = substr($data, 0, $trailerPos);

    // Detect the line ending from the first 100 bytes
    preg_match_all("/(\r\n|\n|\r)/", substr($data, 0, 100), $m);

    $differentLineEndings = count(array_unique($m[0]));
    if ($differentLineEndings > 1) {
        $lines = preg_split("/(\r\n|\n|\r)/", $data, -1, PREG_SPLIT_NO_EMPTY);
    } else if ($differentLineEndings === 1) {
        // All matches are identical here, so the first one is as good as any
        $lines = explode($m[0][0], $data);
    } else {
        // No line break at all: treat the data as a single line
        $lines = array($data);
    }

    unset($data, $differentLineEndings, $m);

    $linesCount = count($lines);

    $start = 1;

    for ($i = 0; $i < $linesCount; $i++) {
        $line = trim($lines[$i]);
        if (!$line) {
            continue;
        }

        $pieces = explode(' ', $line);
        switch (count($pieces)) {
            case 2:
                // Subsection header: "<first object number> <entry count>"
                $start = (int) $pieces[0];
                $end = $start + (int) $pieces[1];
                if ($end > $result['max_object']) {
                    $result['max_object'] = $end;
                }
                break;
            case 3:
                // Entry: "<offset> <generation> <n|f>"
                if (!isset($result['xref'][$start])) {
                    $result['xref'][$start] = array();
                }

                if (!array_key_exists($gen = (int) $pieces[1], $result['xref'][$start])) {
                    $result['xref'][$start][$gen] = $pieces[2] == 'n' ? (int) $pieces[0] : null;
                }
                $start++;
                break;
            default:
                $this->error('Unexpected data in xref table');
        }
    }

    unset($lines, $pieces, $line, $start, $end, $gen);

    // Continue directly after the "trailer" keyword (7 characters)
    fseek($this->f, $o_pos + $trailerPos + 7);

    $c = new pdf_context($this->f);
    $trailer = $this->pdf_read_value($c);

    unset($c);

    if (!isset($result['trailer'])) {
        $result['trailer'] = $trailer;
    }

    if (isset($trailer[1]['/Prev'])) {
        $this->pdf_read_xref($result, $trailer[1]['/Prev'][1]);
    }

    unset($trailer);

    return true;
}
323
/**
 * Reads a value
 *
 * Returns an array whose element 0 is one of the PDF_TYPE_* constants:
 *  - PDF_TYPE_HEX, string            text between "<" and ">"
 *  - PDF_TYPE_DICTIONARY, array      key token => value
 *  - PDF_TYPE_ARRAY, array           list of values
 *  - PDF_TYPE_STRING, string         raw text between the brackets
 *  - PDF_TYPE_STREAM, string         stream bytes; the length is taken
 *                                    from /Length of $this->actual_obj
 *  - PDF_TYPE_OBJDEC, int, int       "n g obj"
 *  - PDF_TYPE_OBJREF, int, int       "n g R"
 *  - PDF_TYPE_NUMERIC, int / PDF_TYPE_REAL, float
 *  - PDF_TYPE_BOOLEAN, bool
 *  - PDF_TYPE_NULL
 *  - PDF_TYPE_TOKEN, string          anything else
 *
 * @param object $c pdf_context
 * @param string $token a Token that was already read, or null to read one
 * @return mixed the value array, or false when the data ends prematurely
 */
function pdf_read_value(&$c, $token = null) {
    if (is_null($token)) {
        $token = $this->pdf_read_token($c);
    }

    if ($token === false) {
        return false;
    }

    switch ($token) {
        case '<':
            // Hex string: take everything up to the closing ">", pulling in
            // more data for as long as the terminator is missing
            $pos = $c->offset;

            while (($match = strpos($c->buffer, '>', $pos)) === false) {
                if (!$c->increase_length()) {
                    return false;
                }
            }

            $result = substr($c->buffer, $c->offset, $match - $c->offset);
            $c->offset = $match + 1;

            return array(PDF_TYPE_HEX, $result);

        case '<<':
            // Dictionary: read key/value pairs until ">>"
            $result = array();

            while (($key = $this->pdf_read_token($c)) !== '>>') {
                if ($key === false) {
                    return false;
                }

                if (($value = $this->pdf_read_value($c)) === false) {
                    return false;
                }

                // A key directly followed by ">>" has no value
                if ($value[0] == PDF_TYPE_TOKEN && $value[1] == '>>') {
                    $result[$key] = array(PDF_TYPE_NULL);
                    break;
                }

                $result[$key] = $value;
            }

            return array(PDF_TYPE_DICTIONARY, $result);

        case '[':
            // Array: read values until "]"
            $result = array();

            while (($token = $this->pdf_read_token($c)) !== ']') {
                if ($token === false) {
                    return false;
                }

                if (($value = $this->pdf_read_value($c, $token)) === false) {
                    return false;
                }

                $result[] = $value;
            }

            return array(PDF_TYPE_ARRAY, $result);

        case '(':
            // Literal string: brackets may nest, a backslash escapes the
            // character that follows it
            $pos = $c->offset;

            $openBrackets = 1;
            do {
                for (; $openBrackets != 0 && $pos < $c->length; $pos++) {
                    $char = $c->buffer[$pos];
                    if ($char === '(') {
                        $openBrackets++;
                    } else if ($char === ')') {
                        $openBrackets--;
                    } else if ($char === '\\') {
                        $pos++; // skip the escaped character
                    }
                }
            } while ($openBrackets != 0 && $c->increase_length());

            // $pos is one past the closing bracket here
            $result = substr($c->buffer, $c->offset, $pos - $c->offset - 1);
            $c->offset = $pos;

            return array(PDF_TYPE_STRING, $result);

        case 'stream':
            // Absolute file position of the byte after the "stream" keyword
            $o_pos = ftell($c->file) - strlen($c->buffer);
            $o_offset = $c->offset;

            $c->reset($startpos = $o_pos + $o_offset);

            // Count the line break (LF, CR or CR LF) in front of the data
            $e = 0;
            $first = isset($c->buffer[0]) ? $c->buffer[0] : '';
            $second = isset($c->buffer[1]) ? $c->buffer[1] : '';
            if ($first == chr(10) || $first == chr(13)) {
                $e++;
            }
            if ($second == chr(10) && $first != chr(10)) {
                $e++;
            }

            if ($this->actual_obj[1][1]['/Length'][0] == PDF_TYPE_OBJREF) {
                // /Length is an indirect object; resolve it with a context
                // of its own so $c is left alone
                $tmp_c = new pdf_context($this->f);
                $tmp_length = $this->pdf_resolve_object($tmp_c, $this->actual_obj[1][1]['/Length']);
                $length = $tmp_length[1][1];
            } else {
                $length = $this->actual_obj[1][1]['/Length'][1];
            }

            if ($length > 0) {
                $c->reset($startpos + $e, $length);
                $v = $c->buffer;
            } else {
                $v = '';
            }
            $c->reset($startpos + $e + $length + 9); // 9 = strlen("endstream")

            return array(PDF_TYPE_STREAM, $v);

        default:
            if (is_numeric($token)) {
                // A number may be the start of "n g obj" or "n g R", so
                // look ahead up to two tokens
                if (($tok2 = $this->pdf_read_token($c)) !== false) {
                    if (is_numeric($tok2)) {
                        if (($tok3 = $this->pdf_read_token($c)) !== false) {
                            switch ($tok3) {
                                case 'obj':
                                    return array(PDF_TYPE_OBJDEC, (int) $token, (int) $tok2);
                                case 'R':
                                    return array(PDF_TYPE_OBJREF, (int) $token, (int) $tok2);
                            }
                            // Just a plain number after all: hand the
                            // look-ahead tokens back to the context
                            array_push($c->stack, $tok3);
                        }
                    }

                    array_push($c->stack, $tok2);
                }

                if ($token === (string) ((int) $token)) {
                    return array(PDF_TYPE_NUMERIC, (int) $token);
                } else {
                    return array(PDF_TYPE_REAL, (float) $token);
                }
            } else if ($token == 'true' || $token == 'false') {
                return array(PDF_TYPE_BOOLEAN, $token == 'true');
            } else if ($token == 'null') {
                return array(PDF_TYPE_NULL);
            } else {
                // Just a token. Return it.
                return array(PDF_TYPE_TOKEN, $token);
            }
    }
}
515
/**
 * Resolve an object
 *
 * An indirect reference is looked up in the xref data and the object it
 * points to is read from the file. Any other value array is returned
 * unchanged.
 *
 * @param object $c pdf_context
 * @param array $obj_spec The object-data
 * @param boolean $encapsulate when true, the result starts with
 *        PDF_TYPE_OBJECT and carries 'obj' and 'gen' keys. Must stay true:
 *        the parser and FPDI call this method without passing it.
 * @return mixed false when $obj_spec is not an array, $obj_spec itself when
 *         it is not a reference, null when the reference is not in the
 *         xref data, otherwise the object that was read
 */
function pdf_resolve_object(&$c, $obj_spec, $encapsulate = true) {
    // Exit if we get invalid data
    if (!is_array($obj_spec)) {
        return false;
    }

    if ($obj_spec[0] != PDF_TYPE_OBJREF) {
        return $obj_spec;
    }

    if (!isset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]])) {
        // Unknown object; the original code fell off the end here as well
        return null;
    }

    // Remember the file position so it can be restored afterwards. This is
    // needed when a reference is resolved while another object is being
    // read (e.g. to determine the length of a stream).
    $old_pos = ftell($c->file);

    // Reposition the file pointer and load the object header.
    $c->reset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]]);

    $header = $this->pdf_read_value($c);

    if (!is_array($header) || $header[0] != PDF_TYPE_OBJDEC || $header[1] != $obj_spec[1] || $header[2] != $obj_spec[2]) {
        // The header is not exactly at the recorded offset; look for it in
        // the data that is already buffered
        $toSearchFor = $obj_spec[1].' '.$obj_spec[2].' obj';
        $headerPos = strpos($c->buffer, $toSearchFor);
        if ($headerPos !== false) {
            $c->offset = $headerPos + strlen($toSearchFor);
            // reset stack
            $c->stack = array();
        } else {
            $this->error("Unable to find object ({$obj_spec[1]}, {$obj_spec[2]}) at expected location");
        }
    }

    // $this->actual_obj is bound by reference so pdf_read_value() can see
    // the values collected so far (it needs /Length when it reaches a stream)
    $result = array();
    $this->actual_obj =& $result;
    if ($encapsulate) {
        $result = array(
            PDF_TYPE_OBJECT,
            'obj' => $obj_spec[1],
            'gen' => $obj_spec[2]
        );
    }

    // Read values until the end-of-object marker
    while (1) {
        $value = $this->pdf_read_value($c);
        if ($value === false || count($result) > 4) {
            // The parser could not find an endobj, so stop here
            break;
        }

        if ($value[0] == PDF_TYPE_TOKEN && $value[1] === 'endobj') {
            break;
        }

        $result[] = $value;
    }

    $c->reset($old_pos);

    if (isset($result[2][0]) && $result[2][0] == PDF_TYPE_STREAM) {
        $result[0] = PDF_TYPE_STREAM;
    }

    return $result;
}
602
603
604
/**
 * Reads a token from the file
 *
 * Tokens pushed back onto $c->stack are returned first. Otherwise leading
 * whitespace and comments are skipped and the next delimiter ("[", "]",
 * "(", ")", "<", ">", "<<", ">>") or the next run of regular characters is
 * returned.
 *
 * @param object $c pdf_context
 * @return mixed the token as string, or false when no more data is available
 */
function pdf_read_token(&$c)
{
    // A token that was pushed back takes precedence
    if (count($c->stack)) {
        return array_pop($c->stack);
    }

    // Strip away any whitespace
    do {
        if (!$c->ensure_content()) {
            return false;
        }
        $c->offset += strspn($c->buffer, " \n\r\t", $c->offset);
    } while ($c->offset >= $c->length - 1);

    // Get the first character in the stream
    $char = $c->buffer[$c->offset++];

    switch ($char) {
        case '[':
        case ']':
        case '(':
        case ')':
            // Array or literal string delimiter
            return $char;

        case '<':
        case '>':
            // Doubled, this is a dictionary delimiter; single, a hex
            // string delimiter
            if ($c->buffer[$c->offset] == $char) {
                if (!$c->ensure_content()) {
                    return false;
                }
                $c->offset++;
                return $char . $char;
            }
            return $char;

        case '%':
            // Comment: skip to the end of the line, reading more data for
            // as long as no line break is in the buffer
            $pos = $c->offset;
            while (preg_match("/(\r\n|\r|\n)/", $c->buffer, $m, PREG_OFFSET_CAPTURE, $pos) === 0) {
                if (!$c->increase_length()) {
                    return false;
                }
            }

            $c->offset = $m[0][1] + strlen($m[0][0]);

            return $this->pdf_read_token($c);

        default:
            // Any other token (e.g. a name or a number): find its end
            if (!$c->ensure_content()) {
                return false;
            }

            while (1) {
                // Length of the token from the current offset
                $pos = strcspn($c->buffer, " %[]<>()\r\n\t/", $c->offset);

                if ($c->offset + $pos <= $c->length - 1) {
                    break;
                }

                // The token may continue beyond the buffer, so read more
                // and try again. Stop when nothing more can be read; the
                // '<' and '%' handling treat a false result from
                // increase_length() the same way. Ignoring it here would
                // loop forever on a token that ends the data.
                if (!$c->increase_length()) {
                    break;
                }
            }

            // offset was already advanced past the first character
            $result = substr($c->buffer, $c->offset - 1, $pos + 1);

            $c->offset += $pos;
            return $result;
    }
}
718	}
719	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: