File "crawler-map.cls.php"
Full Path: /home/ycoalition/public_html/blog/wp-admin/js/widgets/plugins/litespeed-cache/src/crawler-map.cls.php
File size: 13.42 KB
MIME-type: text/x-php
Charset: utf-8
<?php
/**
* The Crawler Sitemap Class
*
* @since 1.1.0
*/
namespace LiteSpeed;
defined( 'WPINC' ) || exit;
class Crawler_Map extends Root {
const BM_MISS = 1;
const BM_HIT = 2;
const BM_BLACKLIST = 4;
private $_home_url; // Used to simplify urls
private $_tb;
private $__data;
private $_conf_map_timeout;
private $_urls = array();
/**
* Instantiate the class
*
* @since 1.1.0
*/
public function __construct() {
$this->_home_url = get_home_url();
$this->__data = Data::cls();
$this->_tb = $this->__data->tb( 'crawler' );
$this->_tb_blacklist = $this->__data->tb( 'crawler_blacklist' );
$this->_conf_map_timeout = $this->conf( Base::O_CRAWLER_MAP_TIMEOUT );
}
/**
* Save URLs crawl status into DB
*
* @since 3.0
* @access public
*/
public function save_map_status( $list, $curr_crawler ) {
global $wpdb;
Utility::compatibility();
$total_crawler = count( Crawler::cls()->list_crawlers() );
$total_crawler_pos = $total_crawler - 1;
// Replace current crawler's position
$curr_crawler = (int) $curr_crawler;
foreach ( $list as $bit => $ids ) { // $ids = [ id => [ url, code ], ... ]
if ( ! $ids ) {
continue;
}
Debug2::debug( "ππΊοΈ Update map [crawler] $curr_crawler [bit] $bit [count] " . count( $ids ) );
// Update res first, then reason
$right_pos = $total_crawler_pos - $curr_crawler;
$sql_res = "CONCAT( LEFT( res, $curr_crawler ), '$bit', RIGHT( res, $right_pos ) )";
$id_all = implode( ',', array_map( 'intval', array_keys( $ids ) ) );
$wpdb->query( "UPDATE `$this->_tb` SET res = $sql_res WHERE id IN ( $id_all )" );
// Add blacklist
if ( $bit == 'B' || $bit == 'N' ) {
$q = "SELECT a.id, a.url FROM `$this->_tb_blacklist` a LEFT JOIN `$this->_tb` b ON b.url=a.url WHERE b.id IN ( $id_all )";
$existing = $wpdb->get_results( $q, ARRAY_A );
// Update current crawler status tag in existing blacklist
if ( $existing ) {
$count = $wpdb->query( "UPDATE `$this->_tb_blacklist` SET res = $sql_res WHERE id IN ( " . implode( ',', array_column( $existing, 'id' ) ) . " )" );
Debug2::debug( 'ππΊοΈ Update blacklist [count] ' . $count );
}
// Append new blacklist
if ( count( $ids ) > count( $existing ) ) {
$new_urls = array_diff( array_column( $ids, 'url' ), array_column( $existing, 'url') );
Debug2::debug( 'ππΊοΈ Insert into blacklist [count] ' . count( $new_urls ) );
$q = "INSERT INTO `$this->_tb_blacklist` ( url, res, reason ) VALUES " . implode( ',', array_fill( 0, count( $new_urls ), '( %s, %s, %s )' ) );
$data = array();
$res = array_fill( 0, $total_crawler, '-' );
$res[ $curr_crawler ] = $bit;
$res = implode( '', $res );
$default_reason = $total_crawler > 1 ? str_repeat( ',', $total_crawler - 1 ) : ''; // Pre-populate default reason value first, update later
foreach ( $new_urls as $url ) {
$data[] = $url;
$data[] = $res;
$data[] = $default_reason;
}
$wpdb->query( $wpdb->prepare( $q, $data ) );
}
}
// Update sitemap reason w/ HTTP code
$reason_array = array();
foreach ( $ids as $id => $v2 ) {
$code = (int)$v2[ 'code' ];
if ( empty( $reason_array[ $code ] ) ) {
$reason_array[ $code ] = array();
}
$reason_array[ $code ][] = (int)$id;
}
foreach ( $reason_array as $code => $v2 ) {
// Complement comma
if ( $curr_crawler ) {
$code = ',' . $code;
}
if ( $curr_crawler < $total_crawler_pos ) {
$code .= ',';
}
$count = $wpdb->query( "UPDATE `$this->_tb` SET reason = CONCAT( SUBSTRING_INDEX( reason, ',', $curr_crawler ), '$code', SUBSTRING_INDEX( reason, ',', -$right_pos ) ) WHERE id IN (" . implode( ',', $v2 ) . ")" );
Debug2::debug( "ππΊοΈ Update map reason [code] $code [pos] left $curr_crawler right -$right_pos [count] $count" );
// Update blacklist reason
if ( $bit == 'B' || $bit == 'N' ) {
$count = $wpdb->query( "UPDATE `$this->_tb_blacklist` a LEFT JOIN `$this->_tb` b ON b.url = a.url SET a.reason = CONCAT( SUBSTRING_INDEX( a.reason, ',', $curr_crawler ), '$code', SUBSTRING_INDEX( a.reason, ',', -$right_pos ) ) WHERE b.id IN (" . implode( ',', $v2 ) . ")" );
Debug2::debug( "ππΊοΈ Update blacklist [code] $code [pos] left $curr_crawler right -$right_pos [count] $count" );
}
}
// Reset list
$list[ $bit ] = array();
}
return $list;
}
/**
* Add one record to blacklist
* NOTE: $id is sitemap table ID
*
* @since 3.0
* @access public
*/
public function blacklist_add( $id ) {
global $wpdb;
$id = (int)$id;
// Build res&reason
$total_crawler = count( Crawler::cls()->list_crawlers() );
$res = str_repeat( 'B', $total_crawler );
$reason = implode( ',', array_fill( 0, $total_crawler, 'Man' ) );
$row = $wpdb->get_row( "SELECT a.url, b.id FROM `$this->_tb` a LEFT JOIN `$this->_tb_blacklist` b ON b.url = a.url WHERE a.id = '$id'", ARRAY_A );
if ( ! $row ) {
Debug2::debug( 'ππΊοΈ blacklist failed to add [id] ' . $id );
return;
}
Debug2::debug( 'ππΊοΈ Add to blacklist [url] ' . $row[ 'url' ] );
$q = "UPDATE `$this->_tb` SET res = %s, reason = %s WHERE id = %d";
$wpdb->query( $wpdb->prepare( $q, array( $res, $reason, $id ) ) );
if ( $row[ 'id' ] ) {
$q = "UPDATE `$this->_tb_blacklist` SET res = %s, reason = %s WHERE id = %d";
$wpdb->query( $wpdb->prepare( $q, array( $res, $reason, $row[ 'id' ] ) ) );
}
else {
$q = "INSERT INTO `$this->_tb_blacklist` (url, res, reason) VALUES (%s, %s, %s)";
$wpdb->query( $wpdb->prepare( $q, array( $row[ 'url' ], $res, $reason ) ) );
}
}
/**
* Delete one record from blacklist
*
* @since 3.0
* @access public
*/
public function blacklist_del( $id ) {
global $wpdb;
if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
return;
}
$id = (int)$id;
Debug2::debug( 'ππΊοΈ blacklist delete [id] ' . $id );
$wpdb->query( "UPDATE `$this->_tb` SET res = REPLACE( REPLACE( res, 'N', '-' ), 'B', '-' ) WHERE url = ( SELECT url FROM `$this->_tb_blacklist` WHERE id = '$id' )" );
$wpdb->query( "DELETE FROM `$this->_tb_blacklist` WHERE id = '$id'" );
}
/**
* Empty blacklist
*
* @since 3.0
* @access public
*/
public function blacklist_empty() {
global $wpdb;
if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
return;
}
Debug2::debug( 'ππΊοΈ Truncate blacklist' );
$wpdb->query( "UPDATE `$this->_tb` SET res = REPLACE( REPLACE( res, 'N', '-' ), 'B', '-' )" );
$wpdb->query( "TRUNCATE `$this->_tb_blacklist`" );
}
/**
* List blacklist
*
* @since 3.0
* @access public
*/
public function list_blacklist( $limit = false, $offset = false ) {
global $wpdb;
if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
return array();
}
$q = "SELECT * FROM `$this->_tb_blacklist` ORDER BY id DESC";
if ( $limit !== false ) {
if ( $offset === false ) {
$total = $this->count_blacklist();
$offset = Utility::pagination( $total, $limit, true );
}
$q .= " LIMIT %d, %d";
$q = $wpdb->prepare( $q, $offset, $limit );
}
return $wpdb->get_results( $q, ARRAY_A );
}
/**
* Count blacklist
*/
public function count_blacklist() {
global $wpdb;
if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
return false;
}
$q = "SELECT COUNT(*) FROM `$this->_tb_blacklist`";
return $wpdb->get_var( $q );
}
/**
* Empty sitemap
*
* @since 3.0
* @access public
*/
public function empty_map() {
Data::cls()->tb_del( 'crawler' );
$msg = __( 'Sitemap cleaned successfully', 'litespeed-cache' );
Admin_Display::succeed( $msg );
}
/**
* List generated sitemap
*
* @since 3.0
* @access public
*/
public function list_map( $limit, $offset = false ) {
global $wpdb;
if ( ! $this->__data->tb_exist( 'crawler' ) ) {
return array();
}
if ( $offset === false ) {
$total = $this->count_map();
$offset = Utility::pagination( $total, $limit, true );
}
$q = "SELECT * FROM `$this->_tb` ORDER BY id LIMIT %d, %d";
return $wpdb->get_results( $wpdb->prepare( $q, $offset, $limit ), ARRAY_A );
}
/**
* Count sitemap
*/
public function count_map() {
global $wpdb;
if ( ! $this->__data->tb_exist( 'crawler' ) ) {
return false;
}
$q = "SELECT COUNT(*) FROM `$this->_tb`";
return $wpdb->get_var( $q );
}
/**
* Generate sitemap
*
* @since 1.1.0
* @access public
*/
public function gen() {
$count = $this->_gen();
if ( ! $count ) {
Admin_Display::error( __( 'No valid sitemap parsed for crawler.', 'litespeed-cache' ) );
return;
}
$msg = sprintf( __( 'Sitemap created successfully: %d items', 'litespeed-cache' ), $count );
Admin_Display::succeed( $msg );
}
/**
* Generate the sitemap
*
* @since 1.1.0
* @access private
*/
private function _gen() {
global $wpdb;
if ( ! $this->__data->tb_exist( 'crawler' ) ) {
$this->__data->tb_create( 'crawler' );
}
if ( ! $this->__data->tb_exist( 'crawler_blacklist' ) ) {
$this->__data->tb_create( 'crawler_blacklist' );
}
// use custom sitemap
if ( ! $sitemap = $this->conf( Base::O_CRAWLER_SITEMAP ) ) {
return false;
}
$offset = strlen( $this->_home_url );
try {
$this->_parse( $sitemap );
} catch( \Exception $e ) {
Debug2::debug( 'ππΊοΈ β failed to parse custom sitemap: ' . $e->getMessage() );
}
if ( is_array( $this->_urls ) && ! empty( $this->_urls ) ) {
if ( $this->conf( Base::O_CRAWLER_DROP_DOMAIN ) ) {
foreach ( $this->_urls as $k => $v ) {
if ( stripos( $v, $this->_home_url ) !== 0 ) {
unset( $this->_urls[ $k ] );
continue;
}
$this->_urls[ $k ] = substr( $v, $offset );
}
}
$this->_urls = array_unique( $this->_urls );
}
Debug2::debug( 'ππΊοΈ Truncate sitemap' );
$wpdb->query( "TRUNCATE `$this->_tb`" );
Debug2::debug( 'ππΊοΈ Generate sitemap' );
// Filter URLs in blacklist
$blacklist = $this->list_blacklist();
$full_blacklisted = array();
$partial_blacklisted = array();
foreach ( $blacklist as $v ) {
if ( strpos( $v[ 'res' ], '-' ) === false ) { // Full blacklisted
$full_blacklisted[] = $v[ 'url' ];
}
else {
// Replace existing reason
$v[ 'reason' ] = explode( ',', $v[ 'reason' ] );
$v[ 'reason' ] = array_map( function( $element ){ return $element ? 'Existed' : ''; }, $v[ 'reason' ] );
$v[ 'reason' ] = implode( ',', $v[ 'reason' ] );
$partial_blacklisted[ $v[ 'url' ] ] = array(
'res' => $v[ 'res' ],
'reason' => $v[ 'reason' ],
);
}
}
// Drop all blacklisted URLs
$this->_urls = array_diff( $this->_urls, $full_blacklisted );
// Default res & reason
$crawler_count = count( Crawler::cls()->list_crawlers() );
$default_res = str_repeat( '-', $crawler_count );
$default_reason = $crawler_count > 1 ? str_repeat( ',', $crawler_count - 1 ) : '';
$data = array();
foreach ( $this->_urls as $url ) {
$data[] = $url;
$data[] = array_key_exists( $url, $partial_blacklisted ) ? $partial_blacklisted[ $url ][ 'res' ] : $default_res;
$data[] = array_key_exists( $url, $partial_blacklisted ) ? $partial_blacklisted[ $url ][ 'reason' ] : $default_reason;
}
foreach ( array_chunk( $data, 300 ) as $data2 ) {
$this->_save( $data2 );
}
// Reset crawler
Crawler::cls()->reset_pos();
return count( $this->_urls );
}
/**
* Save data to table
*
* @since 3.0
* @access private
*/
private function _save( $data, $fields = 'url,res,reason' ) {
global $wpdb;
if ( empty( $data ) ) {
return;
}
$q = "INSERT INTO `$this->_tb` ( $fields ) VALUES ";
// Add placeholder
$q .= Utility::chunk_placeholder( $data, $fields );
// Store data
$wpdb->query( $wpdb->prepare( $q, $data ) );
}
/**
* Parse custom sitemap and return urls
*
* @since 1.1.1
* @access private
*/
private function _parse( $sitemap ) {
/**
* Read via wp func to avoid allow_url_fopen = off
* @since 2.2.7
*/
$response = wp_remote_get( $sitemap, array( 'timeout' => $this->_conf_map_timeout ) );
if ( is_wp_error( $response ) ) {
$error_message = $response->get_error_message();
Debug2::debug( 'ππΊοΈ failed to read sitemap: ' . $error_message );
throw new \Exception( 'Failed to remote read ' . $sitemap );
}
$xml_object = simplexml_load_string( $response[ 'body' ], null, LIBXML_NOCDATA );
if ( ! $xml_object ) {
if ( $this->_urls ) {
return;
}
throw new \Exception( 'Failed to parse xml ' . $sitemap );
}
// start parsing
$xml_array = (array) $xml_object;
if ( ! empty( $xml_array[ 'sitemap' ] ) ) { // parse sitemap set
if ( is_object( $xml_array[ 'sitemap' ] ) ) {
$xml_array[ 'sitemap' ] = (array) $xml_array[ 'sitemap' ];
}
if ( ! empty( $xml_array[ 'sitemap' ][ 'loc' ] ) ) { // is single sitemap
$this->_parse( $xml_array[ 'sitemap' ][ 'loc' ] );
}
else {
// parse multiple sitemaps
foreach ( $xml_array[ 'sitemap' ] as $val ) {
$val = (array) $val;
if ( ! empty( $val[ 'loc' ] ) ) {
$this->_parse( $val[ 'loc' ] ); // recursive parse sitemap
}
}
}
}
elseif ( ! empty( $xml_array[ 'url' ] ) ) { // parse url set
if ( is_object( $xml_array[ 'url' ] ) ) {
$xml_array[ 'url' ] = (array) $xml_array[ 'url' ];
}
// if only 1 element
if ( ! empty( $xml_array[ 'url' ][ 'loc' ] ) ) {
$this->_urls[] = $xml_array[ 'url' ][ 'loc' ];
}
else {
foreach ( $xml_array[ 'url' ] as $val ) {
$val = (array) $val;
if ( ! empty( $val[ 'loc' ] ) ) {
$this->_urls[] = $val[ 'loc' ];
}
}
}
}
}
}