php清除XSS通用类：CI框架的XSS移除类

作者：维易PHP培训学院　时间：2017-08-06

《php清除XSS通用类：CI框架的XSS移除类》要点：
本文介绍了php清除XSS通用类：CI框架的XSS移除类，希望对您有用。如果有疑问，可以联系我们。

前端在线编辑器供用户录入文章内容或论坛帖子，因此也会成为攻击者发起攻击的一个渠道，其中主要有XSS攻击和图片上传攻击等。

本文着重讲解XSS攻击和过滤，让网站更安全。

下面是一个CI框架的XSS过滤类。处理这类攻击很好用。

调用：

// Sample rich-text input: harmless markup mixed with event-handler
// attributes, a <script> element and an <input> element.
$html = '
这是一个正常链接 <a href="http://asdfsd.com">链接1</a>，
这是一个有罪恶属性的链接 <a href="http://www.asdfsd.com" onmousemove="alert(\'a\')">链接1</a>，<br />
这是一个图片<img src="http://www.baidu.com/img/baidu_sylogo1.gif" border="0" onclick="alert(\'hi\')" />，
这是一个脚本<script>alert("hi")</script>
这是一个文本框<input type="text" size="14" onmousemove="alert(\'a\')" border="1" />  <br />
这是一个加粗字<strong>STRONG</strong>和<b>B</b>
';

// Second argument FALSE: return the cleaned string instead of the
// boolean "is this image clean?" verdict.
$filter = new cleanXSS();

echo $filter->xss_clean($html, FALSE);

/*
这是一个正常链接 <a href="http://asdfsd.com">链接1</a>，
这是一个有罪恶属性的链接 <a >链接1</a>，<br />
这是一个图片<img  />，
这是一个脚本[removed]alert&#40;"hi"&#41;[removed]
这是一个文本框&lt;input type="text" size="14"  border="1" /&gt;&lt;br />
这是一个加粗字<strong>STRONG</strong>和<b>B</b>';



即：
任何有非法属性的脚本均标记为[removed]alert&#40;"hi"&#41;[removed]
对于链接、图片则只留空标签
注意：文本框 <input type="text" size="14" onmousemove="alert(\'a\')" border="1" /> 与其后的 <br /> 这两个标签，
如果后面的 <br /> 紧靠在前一个标签之后，则会被转化成 &lt;br />

*/

类文件：

//适用于在线编辑器、URL
/**
 * Standalone XSS filter derived from the CodeIgniter Security class.
 *
 * Intended for rich-text editor input and URLs. The public entry point is
 * xss_clean(); the remaining public methods are helpers that can also be
 * used on their own. Relies on the global remove_invisible_characters()
 * function being defined.
 */
class cleanXSS
{
    /**
     * Random marker that temporarily replaces "&" in query strings while
     * entities are validated. Generated lazily by xss_hash().
     *
     * @var string
     */
    protected $_xss_hash = '';

    /**
     * Literal substrings that are never allowed, mapped to their replacement.
     *
     * @var array
     */
    protected $_never_allowed_str = array(
        'document.cookie' => '[removed]',
        'document.write'  => '[removed]',
        '.parentNode'     => '[removed]',
        '.innerHTML'      => '[removed]',
        'window.location' => '[removed]',
        '-moz-binding'    => '[removed]',
        '<!--'            => '&lt;!--',
        '-->'             => '--&gt;',
        '<![CDATA['       => '&lt;![CDATA[',
        '<comment>'       => '&lt;comment&gt;',
    );

    /**
     * Regex fragments that are never allowed; every match is replaced
     * with "[removed]" by _do_never_allowed().
     *
     * @var array
     */
    protected $_never_allowed_regex = array(
        'javascript\s*:',
        'expression\s*(\(|&\#40;)', // CSS and IE
        'vbscript\s*:', // IE, surprise!
        'Redirect\s+302',
        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?",
    );

    public function __construct()
    {
    }

    /**
     * Sanitize a string (or every element of an array) against XSS.
     *
     * @param string|array $str      Input to clean; arrays are cleaned element by element
     * @param bool         $is_image TRUE to only report whether the data is clean
     * @return string|array|bool Cleaned string/array, or when $is_image is TRUE a
     *                           boolean: TRUE if nothing had to be changed
     */
    public function xss_clean($str, $is_image = FALSE)
    {
        // Arrays are cleaned recursively. foreach replaces the each() loop,
        // which no longer exists in PHP 8.
        if (is_array($str)) {
            foreach ($str as $key => $value) {
                $str[$key] = $this->xss_clean($value);
            }

            return $str;
        }

        // Remove invisible characters
        $str = remove_invisible_characters($str);

        // Validate entities in URLs
        $str = $this->_validate_entities($str);

        /*
         * URL decode, in case something like this is submitted:
         *
         * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
         *
         * rawurldecode() is used so plus signs are not removed.
         */
        $str = rawurldecode($str);

        /*
         * Convert character entities to ASCII so the tests below work
         * reliably. Only entities within tags are converted, since those
         * are the ones that pose security problems.
         */
        $str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", array($this, '_convert_attribute'), $str);
        $str = preg_replace_callback("/<\w+.*?(?=>|<|$)/si", array($this, '_decode_entity'), $str);

        // Remove invisible characters again: decoding may have produced new ones
        $str = remove_invisible_characters($str);

        /*
         * Convert all tabs to spaces to defeat strings like "ja	vascript".
         * Spaces between characters are dealt with later. str_replace is
         * used because preg_replace was found to be very slow here on
         * large blocks of data.
         */
        if (strpos($str, "\t") !== FALSE) {
            $str = str_replace("\t", ' ', $str);
        }

        // Capture the converted string for the image comparison at the end
        $converted_string = $str;

        // Remove strings that are never allowed
        $str = $this->_do_never_allowed($str);

        /*
         * Make PHP tags safe. XML declarations (<?xml) are inadvertently
         * replaced too, but that does not seem to pose a problem.
         */
        if ($is_image === TRUE) {
            // Images tend to contain the PHP short opening and closing
            // tags by accident, so only the long opening tag is encoded.
            $str = preg_replace('/<\?(php)/i', "&lt;?\\1", $str);
        } else {
            $str = str_replace(array('<?', '?' . '>'), array('&lt;?', '?&gt;'), $str);
        }

        /*
         * Compact exploded words such as "j a v a s c r i p t" back to
         * their normal spelling.
         */
        $words = array('javascript', 'expression', 'vbscript', 'script', 'base64', 'applet', 'alert', 'document', 'write', 'cookie', 'window');

        foreach ($words as $word) {
            // Builds e.g. "w\s*i\s*n\s*d\s*o\s*w": optional whitespace
            // between the letters, none after the last one.
            $spaced = implode('\s*', str_split($word));

            // Only compact when followed by a non-word character, so that
            // valid text like "dealer to" does not become "dealerto".
            $str = preg_replace_callback('#(' . $spaced . ')(\W)#is', array($this, '_compact_exploded_words'), $str);
        }

        /*
         * Remove disallowed JavaScript in links or img tags, and strip
         * script/xss tags. Repeated until the string stops changing.
         */
        do {
            $original = $str;

            if (preg_match("/<a/i", $str)) {
                $str = preg_replace_callback("#<a\s+([^>]*?)(>|$)#si", array($this, '_js_link_removal'), $str);
            }

            if (preg_match("/<img/i", $str)) {
                $str = preg_replace_callback("#<img\s+([^>]*?)(\s?/?>|$)#si", array($this, '_js_img_removal'), $str);
            }

            if (preg_match("/script/i", $str) OR preg_match("/xss/i", $str)) {
                $str = preg_replace("#<(/*)(script|xss)(.*?)\>#si", '[removed]', $str);
            }
        } while ($original !== $str);
        unset($original);

        // Remove evil attributes such as style, onclick and xmlns
        $str = $this->_remove_evil_attributes($str, $is_image);

        /*
         * Sanitize naughty HTML elements: a tag whose name is in the list
         * below gets its brackets converted to entities.
         * So this: <blink>
         * Becomes: &lt;blink&gt;
         */
        $naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
        $str = preg_replace_callback('#<(/*\s*)(' . $naughty . ')([^><]*)([><]*)#is', array($this, '_sanitize_naughty_html'), $str);

        /*
         * Sanitize naughty scripting elements: instead of removing the
         * call, its parentheses are converted to entities.
         * For example:        eval('some code')
         * Becomes:           eval&#40;'some code'&#41;
         */
        $str = preg_replace('#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $str);

        // Final clean up, in case something got through the filters above
        $str = $this->_do_never_allowed($str);

        /*
         * Images are handled specially: the caller only wants to know
         * whether any change was made after character conversion.
         * TRUE means the data was left untouched, i.e. it is clean.
         */
        if ($is_image === TRUE) {
            return $str === $converted_string;
        }

        /*
         * Drop whatever sits between a pair of "[removed]" markers (for
         * example a script body). The span is collapsed to a single marker
         * rather than to an empty string: deleting it outright would let
         * "<scr[removed][removed]ipt>" rejoin into a live "<script>" tag
         * after all filters have already run.
         */
        return preg_replace('/(\[removed\]).*\\1/iUs', '[removed]', $str);
    }

    // --------------------------------------------------------------------

    /**
     * Random hash used to protect URLs.
     *
     * The value is generated once per instance and then reused.
     *
     * @return string 32 hexadecimal characters
     */
    public function xss_hash()
    {
        if ($this->_xss_hash === '') {
            $hash = '';

            // Prefer the CSPRNG where the runtime provides one (PHP 7+)
            if (function_exists('random_bytes')) {
                try {
                    $hash = bin2hex(random_bytes(16));
                } catch (Exception $e) {
                    $hash = '';
                }
            }

            if ($hash === '') {
                mt_srand();
                $hash = md5((string) (time() + mt_rand(0, 1999999999)));
            }

            $this->_xss_hash = $hash;
        }

        return $this->_xss_hash;
    }

    // --------------------------------------------------------------------

    /**
     * HTML entities decode.
     *
     * A replacement for html_entity_decode(): most browsers still interpret
     * a numeric entity that lacks its trailing semicolon, while
     * html_entity_decode() does not convert those, so they are decoded here
     * as well.
     *
     * @param string $str     Text to decode
     * @param string $charset Character set passed to html_entity_decode()
     * @return string
     */
    public function entity_decode($str, $charset = 'UTF-8')
    {
        if (strpos($str, '&') === FALSE) {
            return $str;
        }

        $str = html_entity_decode($str, ENT_COMPAT, $charset);

        // Callbacks replace the "/e" modifier, which was removed in PHP 7
        $str = preg_replace_callback('~&#x(0*[0-9a-f]{2,5})~i', array($this, '_hex_entity_to_char'), $str);

        return preg_replace_callback('~&#([0-9]{2,4})~', array($this, '_dec_entity_to_char'), $str);
    }

    /**
     * Callback for entity_decode(): hexadecimal entity digits to one byte.
     *
     * @param array $match Regex match; index 1 holds the hex digits
     * @return string
     */
    protected function _hex_entity_to_char($match)
    {
        return chr(hexdec($match[1]));
    }

    /**
     * Callback for entity_decode(): decimal entity digits to one byte.
     *
     * @param array $match Regex match; index 1 holds the decimal digits
     * @return string
     */
    protected function _dec_entity_to_char($match)
    {
        // The cast reads the digits as base 10 even with leading zeros
        return chr((int) $match[1]);
    }

    // --------------------------------------------------------------------

    /**
     * Filename security.
     *
     * Strips path-traversal sequences and characters that are unsafe in
     * file names.
     *
     * @param string $str           File name to sanitize
     * @param bool   $relative_path TRUE to keep "./" and "/" so relative paths survive
     * @return string
     */
    public function sanitize_filename($str, $relative_path = FALSE)
    {
        $bad = array(
            "../", "<!--", "-->", "<", ">", "'", '"', '&', '$', '#',
            '{', '}', '[', ']', '=', ';', '?', "%20", "%22",
            "%3c",   // <
            "%253c", // < (double encoded)
            "%3e",   // >
            "%0e",   // 0x0E control character
            "%28",   // (
            "%29",   // )
            "%2528", // ( (double encoded)
            "%26",   // &
            "%24",   // $
            "%3f",   // ?
            "%3b",   // ;
            "%3d",   // =
        );

        if (!$relative_path) {
            $bad[] = './';
            $bad[] = '/';
        }

        $str = remove_invisible_characters($str, FALSE);

        // Repeat until stable: a single pass turns "....//" into "../"
        do {
            $previous = $str;
            $str = str_replace($bad, '', $str);
        } while ($previous !== $str);

        return stripslashes($str);
    }

    // ----------------------------------------------------------------

    /**
     * Compact exploded words.
     *
     * Callback for xss_clean() that removes whitespace from things like
     * "j a v a s c r i p t".
     *
     * @param array $matches Index 1: the spaced-out word, index 2: the following non-word character
     * @return string
     */
    protected function _compact_exploded_words($matches)
    {
        return preg_replace('/\s+/s', '', $matches[1]) . $matches[2];
    }

    // --------------------------------------------------------------------

    /**
     * Remove evil HTML attributes (event handlers, style, xmlns, formaction).
     *
     * It removes the evil attribute and either:
     *     - Everything up until a space
     *            For example, everything between the pipes:
     *            <a |style=document.write('hello');alert('world');| class=link>
     *     - Everything inside the quotes
     *            For example, everything between the pipes:
     *            <a |style="document.write('hello'); alert('world');"| class="link">
     *
     * @param string $str      The string to check
     * @param bool   $is_image TRUE if this is an image
     * @return string The string with the evil attributes removed
     */
    protected function _remove_evil_attributes($str, $is_image)
    {
        // All JavaScript event handlers (e.g. onload, onclick), style, xmlns, formaction
        $evil_attributes = array('on\w*', 'style', 'xmlns', 'formaction');

        if ($is_image === TRUE) {
            // Adobe Photoshop puts XML metadata into JFIF images, including
            // namespacing, so xmlns has to be allowed for images.
            unset($evil_attributes[array_search('xmlns', $evil_attributes)]);
        }

        $names = implode('|', $evil_attributes);

        do {
            $count = 0;
            $attribs = array();

            // Find occurrences of illegal attribute strings without quotes
            preg_match_all('/(' . $names . ')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);

            foreach ($matches as $attr) {
                $attribs[] = preg_quote($attr[0], '/');
            }

            // Find occurrences of illegal attribute strings with quotes (042 and 047 are octal quotes)
            preg_match_all("/(" . $names . ")\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is", $str, $matches, PREG_SET_ORDER);

            foreach ($matches as $attr) {
                $attribs[] = preg_quote($attr[0], '/');
            }

            // Replace illegal attribute strings that are inside an HTML tag
            if (count($attribs) > 0) {
                $str = preg_replace("/<(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(" . implode('|', $attribs) . ")(.*?)([\s><])([><]*)/i", '<$1 $3$5$6$7', $str, -1, $count);
            }
        } while ($count);

        return $str;
    }

    // --------------------------------------------------------------------

    /**
     * Sanitize naughty HTML.
     *
     * Callback for xss_clean() that entity-encodes the brackets of a
     * disallowed element.
     *
     * @param array $matches Regex match from xss_clean()
     * @return string
     */
    protected function _sanitize_naughty_html($matches)
    {
        // Encode the opening bracket
        $str = '&lt;' . $matches[1] . $matches[2] . $matches[3];

        // Encode any captured bracket run to prevent recursive vectors
        $str .= str_replace(array('>', '<'), array('&gt;', '&lt;'), $matches[4]);

        return $str;
    }

    // --------------------------------------------------------------------

    /**
     * JS link removal.
     *
     * Callback for xss_clean() that sanitizes <a> tags. Working on one tag
     * at a time limits PCRE backtracking and avoids
     * PREG_BACKTRACK_LIMIT_ERROR on link-heavy strings.
     *
     * @param array $match Index 0: the whole tag, index 1: its attribute string
     * @return string
     */
    protected function _js_link_removal($match)
    {
        $attributes = $this->_filter_attributes(str_replace(array('<', '>'), '', $match[1]));
        $cleaned = preg_replace('#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si', '', $attributes);

        return str_replace($match[1], $cleaned, $match[0]);
    }

    // --------------------------------------------------------------------

    /**
     * JS image removal.
     *
     * Callback for xss_clean() that sanitizes <img> tags. Working on one
     * tag at a time limits PCRE backtracking and avoids
     * PREG_BACKTRACK_LIMIT_ERROR on image-heavy strings.
     *
     * @param array $match Index 0: the whole tag, index 1: its attribute string
     * @return string
     */
    protected function _js_img_removal($match)
    {
        $attributes = $this->_filter_attributes(str_replace(array('<', '>'), '', $match[1]));
        $cleaned = preg_replace('#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si', '', $attributes);

        return str_replace($match[1], $cleaned, $match[0]);
    }

    // --------------------------------------------------------------------

    /**
     * Attribute conversion.
     *
     * Callback for xss_clean() that encodes angle brackets and doubles
     * backslashes inside a quoted attribute.
     *
     * @param array $match Index 0: the matched attribute
     * @return string
     */
    protected function _convert_attribute($match)
    {
        return str_replace(array('>', '<', '\\'), array('&gt;', '&lt;', '\\\\'), $match[0]);
    }

    // --------------------------------------------------------------------

    /**
     * Filter attributes.
     *
     * Keeps only quoted name="value" attributes and strips comment
     * sequences from them.
     *
     * @param string $str Attribute string of a tag
     * @return string
     */
    protected function _filter_attributes($str)
    {
        $out = '';

        if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $str, $matches)) {
            foreach ($matches[0] as $match) {
                $out .= preg_replace("#/\*.*?\*/#s", '', $match);
            }
        }

        return $out;
    }

    // --------------------------------------------------------------------

    /**
     * HTML entity decode callback.
     *
     * Callback for xss_clean(). The charset is fixed to UTF-8 here.
     *
     * @param array $match Index 0: the text to decode
     * @return string
     */
    protected function _decode_entity($match)
    {
        return $this->entity_decode($match[0], 'utf-8');
    }

    // --------------------------------------------------------------------

    /**
     * Validate URL entities.
     *
     * Called by xss_clean().
     *
     * @param string $str
     * @return string
     */
    protected function _validate_entities($str)
    {
        /*
         * Protect GET variables in URLs: swap the "&" in front of each
         * name=value pair for the xss_hash() marker; it is swapped back below.
         */
        $str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', $this->xss_hash() . "\\1=\\2", $str);

        /*
         * Validate standard character entities: add a semicolon if it is
         * missing, so the entity can be converted to ASCII later.
         */
        $str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);

        /*
         * Validate UTF16 two-byte encoding (x00): as above, add a
         * semicolon if it is missing.
         */
        $str = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $str);

        // Un-protect GET variables in URLs
        $str = str_replace($this->xss_hash(), '&', $str);

        return $str;
    }

    // ----------------------------------------------------------------------

    /**
     * Do never allowed.
     *
     * Utility for xss_clean(): replaces the never-allowed literals
     * (document.cookie and friends) and regex matches.
     *
     * @param string $str
     * @return string
     */
    protected function _do_never_allowed($str)
    {
        $str = str_replace(array_keys($this->_never_allowed_str), $this->_never_allowed_str, $str);

        foreach ($this->_never_allowed_regex as $regex) {
            $str = preg_replace('#' . $regex . '#is', '[removed]', $str);
        }

        return $str;
    }
}

/**
 * Remove invisible (control) characters from a string.
 *
 * Strips every control character except newline (dec 10), carriage
 * return (dec 13) and horizontal tab (dec 09), so that sequences such as
 * "Java\0script" cannot slip past later filters.
 *
 * @param string $str         Text to clean
 * @param bool   $url_encoded TRUE to also strip the URL-encoded forms (%00 and so on)
 * @return string
 */
function remove_invisible_characters ( $str, $url_encoded = TRUE )
{
       $non_displayables = array();

       if ( $url_encoded ) {
              // Case-insensitive: percent-encoding may use upper- or lower-case hex digits
              $non_displayables[] = '/%0[0-8bcef]/i';        // url encoded 00-08, 11, 12, 14, 15
              $non_displayables[] = '/%1[0-9a-f]/i';        // url encoded 16-31
              $non_displayables[] = '/%7f/i';                // url encoded 127
       }

       $non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';        // 00-08, 11, 12, 14-31, 127

       // Repeat until nothing is removed: one removal can expose another
       // sequence (e.g. "%%0000" becomes "%00").
       do {
              $str = preg_replace ( $non_displayables, '', $str, -1, $count );
       } while ( $count );

       return $str;
}

转载请注明本页网址：
http://www.vephp.com/jiaocheng/155.html

标签：php 清除XSS

PHP教程

WEB前端开发

数据库

WEB服务器

APP开发

LINUX学习

后端开发课程

前端开发课程

数据库课程

php清除XSS通用类：CI框架的XSS移除类

相关教程

同类教程排行

特辑教程