1 <?php
  2 /**
  3  * SimplePie
  4  *
  5  * A PHP-Based RSS and Atom Feed Framework.
  6  * Takes the hard work out of managing a complete RSS/Atom solution.
  7  *
  8  * Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
  9  * All rights reserved.
 10  *
 11  * Redistribution and use in source and binary forms, with or without modification, are
 12  * permitted provided that the following conditions are met:
 13  *
 14  *  * Redistributions of source code must retain the above copyright notice, this list of
 15  *    conditions and the following disclaimer.
 16  *
 17  *  * Redistributions in binary form must reproduce the above copyright notice, this list
 18  *    of conditions and the following disclaimer in the documentation and/or other materials
 19  *    provided with the distribution.
 20  *
 21  *  * Neither the name of the SimplePie Team nor the names of its contributors may be used
 22  *    to endorse or promote products derived from this software without specific prior
 23  *    written permission.
 24  *
 25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
 26  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 27  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
 28  * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 32  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 33  * POSSIBILITY OF SUCH DAMAGE.
 34  *
 35  * @package SimplePie
 36  * @version 1.3.1
 37  * @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
 38  * @author Ryan Parman
 39  * @author Geoffrey Sneddon
 40  * @author Ryan McCue
 41  * @link http://simplepie.org/ SimplePie
 42  * @license http://www.opensource.org/licenses/bsd-license.php BSD License
 43  */
 44 
 45 
 46 /**
 47  * Content-type sniffing
 48  *
 49  * Based on the rules in http://tools.ietf.org/html/draft-abarth-mime-sniff-06
 50  *
 51  * This is used since we can't always trust Content-Type headers, and is based
 52  * upon the HTML5 parsing rules.
 53  *
 54  *
 55  * This class can be overloaded with {@see SimplePie::set_content_type_sniffer_class()}
 56  *
 57  * @package SimplePie
 58  * @subpackage HTTP
 59  */
 60 class SimplePie_Content_Type_Sniffer
 61 {
 62     /**
 63      * File object
 64      *
 65      * @var SimplePie_File
 66      */
 67     var $file;
 68 
 69     /**
 70      * Create an instance of the class with the input file
 71      *
 72      * @param SimplePie_Content_Type_Sniffer $file Input file
 73      */
 74     public function __construct($file)
 75     {
 76         $this->file = $file;
 77     }
 78 
 79     /**
 80      * Get the Content-Type of the specified file
 81      *
 82      * @return string Actual Content-Type
 83      */
 84     public function get_type()
 85     {
 86         if (isset($this->file->headers['content-type']))
 87         {
 88             if (!isset($this->file->headers['content-encoding'])
 89                 && ($this->file->headers['content-type'] === 'text/plain'
 90                     || $this->file->headers['content-type'] === 'text/plain; charset=ISO-8859-1'
 91                     || $this->file->headers['content-type'] === 'text/plain; charset=iso-8859-1'
 92                     || $this->file->headers['content-type'] === 'text/plain; charset=UTF-8'))
 93             {
 94                 return $this->text_or_binary();
 95             }
 96 
 97             if (($pos = strpos($this->file->headers['content-type'], ';')) !== false)
 98             {
 99                 $official = substr($this->file->headers['content-type'], 0, $pos);
100             }
101             else
102             {
103                 $official = $this->file->headers['content-type'];
104             }
105             $official = trim(strtolower($official));
106 
107             if ($official === 'unknown/unknown'
108                 || $official === 'application/unknown')
109             {
110                 return $this->unknown();
111             }
112             elseif (substr($official, -4) === '+xml'
113                 || $official === 'text/xml'
114                 || $official === 'application/xml')
115             {
116                 return $official;
117             }
118             elseif (substr($official, 0, 6) === 'image/')
119             {
120                 if ($return = $this->image())
121                 {
122                     return $return;
123                 }
124                 else
125                 {
126                     return $official;
127                 }
128             }
129             elseif ($official === 'text/html')
130             {
131                 return $this->feed_or_html();
132             }
133             else
134             {
135                 return $official;
136             }
137         }
138         else
139         {
140             return $this->unknown();
141         }
142     }
143 
144     /**
145      * Sniff text or binary
146      *
147      * @return string Actual Content-Type
148      */
149     public function text_or_binary()
150     {
151         if (substr($this->file->body, 0, 2) === "\xFE\xFF"
152             || substr($this->file->body, 0, 2) === "\xFF\xFE"
153             || substr($this->file->body, 0, 4) === "\x00\x00\xFE\xFF"
154             || substr($this->file->body, 0, 3) === "\xEF\xBB\xBF")
155         {
156             return 'text/plain';
157         }
158         elseif (preg_match('/[\x00-\x08\x0E-\x1A\x1C-\x1F]/', $this->file->body))
159         {
160             return 'application/octect-stream';
161         }
162         else
163         {
164             return 'text/plain';
165         }
166     }
167 
168     /**
169      * Sniff unknown
170      *
171      * @return string Actual Content-Type
172      */
173     public function unknown()
174     {
175         $ws = strspn($this->file->body, "\x09\x0A\x0B\x0C\x0D\x20");
176         if (strtolower(substr($this->file->body, $ws, 14)) === '<!doctype html'
177             || strtolower(substr($this->file->body, $ws, 5)) === '<html'
178             || strtolower(substr($this->file->body, $ws, 7)) === '<script')
179         {
180             return 'text/html';
181         }
182         elseif (substr($this->file->body, 0, 5) === '%PDF-')
183         {
184             return 'application/pdf';
185         }
186         elseif (substr($this->file->body, 0, 11) === '%!PS-Adobe-')
187         {
188             return 'application/postscript';
189         }
190         elseif (substr($this->file->body, 0, 6) === 'GIF87a'
191             || substr($this->file->body, 0, 6) === 'GIF89a')
192         {
193             return 'image/gif';
194         }
195         elseif (substr($this->file->body, 0, 8) === "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A")
196         {
197             return 'image/png';
198         }
199         elseif (substr($this->file->body, 0, 3) === "\xFF\xD8\xFF")
200         {
201             return 'image/jpeg';
202         }
203         elseif (substr($this->file->body, 0, 2) === "\x42\x4D")
204         {
205             return 'image/bmp';
206         }
207         elseif (substr($this->file->body, 0, 4) === "\x00\x00\x01\x00")
208         {
209             return 'image/vnd.microsoft.icon';
210         }
211         else
212         {
213             return $this->text_or_binary();
214         }
215     }
216 
217     /**
218      * Sniff images
219      *
220      * @return string Actual Content-Type
221      */
222     public function image()
223     {
224         if (substr($this->file->body, 0, 6) === 'GIF87a'
225             || substr($this->file->body, 0, 6) === 'GIF89a')
226         {
227             return 'image/gif';
228         }
229         elseif (substr($this->file->body, 0, 8) === "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A")
230         {
231             return 'image/png';
232         }
233         elseif (substr($this->file->body, 0, 3) === "\xFF\xD8\xFF")
234         {
235             return 'image/jpeg';
236         }
237         elseif (substr($this->file->body, 0, 2) === "\x42\x4D")
238         {
239             return 'image/bmp';
240         }
241         elseif (substr($this->file->body, 0, 4) === "\x00\x00\x01\x00")
242         {
243             return 'image/vnd.microsoft.icon';
244         }
245         else
246         {
247             return false;
248         }
249     }
250 
251     /**
252      * Sniff HTML
253      *
254      * @return string Actual Content-Type
255      */
256     public function feed_or_html()
257     {
258         $len = strlen($this->file->body);
259         $pos = strspn($this->file->body, "\x09\x0A\x0D\x20");
260 
261         while ($pos < $len)
262         {
263             switch ($this->file->body[$pos])
264             {
265                 case "\x09":
266                 case "\x0A":
267                 case "\x0D":
268                 case "\x20":
269                     $pos += strspn($this->file->body, "\x09\x0A\x0D\x20", $pos);
270                     continue 2;
271 
272                 case '<':
273                     $pos++;
274                     break;
275 
276                 default:
277                     return 'text/html';
278             }
279 
280             if (substr($this->file->body, $pos, 3) === '!--')
281             {
282                 $pos += 3;
283                 if ($pos < $len && ($pos = strpos($this->file->body, '-->', $pos)) !== false)
284                 {
285                     $pos += 3;
286                 }
287                 else
288                 {
289                     return 'text/html';
290                 }
291             }
292             elseif (substr($this->file->body, $pos, 1) === '!')
293             {
294                 if ($pos < $len && ($pos = strpos($this->file->body, '>', $pos)) !== false)
295                 {
296                     $pos++;
297                 }
298                 else
299                 {
300                     return 'text/html';
301                 }
302             }
303             elseif (substr($this->file->body, $pos, 1) === '?')
304             {
305                 if ($pos < $len && ($pos = strpos($this->file->body, '?>', $pos)) !== false)
306                 {
307                     $pos += 2;
308                 }
309                 else
310                 {
311                     return 'text/html';
312                 }
313             }
314             elseif (substr($this->file->body, $pos, 3) === 'rss'
315                 || substr($this->file->body, $pos, 7) === 'rdf:RDF')
316             {
317                 return 'application/rss+xml';
318             }
319             elseif (substr($this->file->body, $pos, 4) === 'feed')
320             {
321                 return 'application/atom+xml';
322             }
323             else
324             {
325                 return 'text/html';
326             }
327         }
328 
329         return 'text/html';
330     }
331 }
332 
333