.
*
* END LICENSE
*
* @author Chris Pollett chris@pollett.org
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009 - 2015
* @filesource
*/
/* Deny direct web access: BASE_DIR is only defined when this file is
   loaded through the framework's entry scripts. */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
* This component is used to provide activities for the admin controller related
* to configuring and performing a web or archive crawl
*
* @author Chris Pollett
* @package seek_quarry\controller\component
*/
class CrawlComponent extends Component implements CrawlConstants
{
/**
* Used to handle the manage crawl activity.
*
* This activity allows new crawls to be started, statistics about old
* crawls to be seen. It allows a user to stop the current crawl or
* restart an old crawl. It also allows a user to configure the options
* by which a crawl is conducted
*
* @return array $data information and statistics about crawls in the system
* as well as status messages on performing a given sub activity
*/
function manageCrawls()
{
    $parent = $this->parent;
    $crawl_model = $parent->model("crawl");
    // Sub-activities this handler responds to, selected via $_REQUEST['arg']
    $possible_arguments =
        array("start", "resume", "delete", "stop", "index", "options");
    $data["ELEMENT"] = "managecrawls";
    $data['SCRIPT'] = "doUpdate();";
    // Paging fields carried through the redirects performed below
    $request_fields = array('start_row', 'num_show', 'end_row');
    $flag = 0;
    foreach($request_fields as $field) {
        /* First field (start_row, $flag == 0) defaults to 0; the later
           fields default to the already-computed NUM_SHOW or, failing
           that, DEFAULT_ADMIN_PAGING_NUM. Request values are cleaned to
           non-negative ints. */
        $data[strtoupper($field)] = isset($_REQUEST[$field]) ? max(0,
            $parent->clean($_REQUEST[$field], 'int')) :
            (isset($data['NUM_SHOW']) ? $data['NUM_SHOW'] :
            $flag * DEFAULT_ADMIN_PAGING_NUM);
        $flag = 1;
    }
    if(isset($_REQUEST['arg']) &&
        in_array($_REQUEST['arg'], $possible_arguments)) {
        $machine_urls = $parent->model("machine")->getQueueServerUrls();
        $num_machines = count($machine_urls);
        /* No machines, or a single localhost queue server, means a
           non-distributed setup; NULL signals local-only operation to
           the model methods below */
        if($num_machines < 1 || ($num_machines == 1 &&
            UrlParser::isLocalhostUrl($machine_urls[0]))) {
            $machine_urls = NULL;
        }
        switch($_REQUEST['arg'])
        {
            case "start":
                $this->startCrawl($data, $machine_urls);
                $parent->redirectWithMessage(
                    tl('crawl_component_starting_new_crawl'),
                    $request_fields);
            break;

            case "stop":
                /* Delete any queued crawl-parameter file so the stopped
                   crawl is not immediately restarted from it */
                $crawl_param_file = CRAWL_DIR."/schedules/crawl_params.txt";
                if(file_exists($crawl_param_file)) {
                    unlink($crawl_param_file);
                }
                // Leave a STOP_CRAWL message for the name server
                $info = array();
                $info[self::STATUS] = "STOP_CRAWL";
                $filename = CRAWL_DIR.
                    "/schedules/name_server_messages.txt";
                file_put_contents($filename, serialize($info));
                $crawl_model->sendStopCrawlMessage($machine_urls);
                $parent->redirectWithMessage(
                    tl('crawl_component_stop_crawl'), $request_fields);
            break;

            case "resume":
                /* Rebuild the crawl parameters of the old crawl from its
                   stored seed info, then ask the queue servers to resume */
                $crawl_params = array();
                $crawl_params[self::STATUS] = "RESUME_CRAWL";
                $crawl_params[self::CRAWL_TIME] =
                    substr($parent->clean($_REQUEST['timestamp'], "int"),0,
                    TIMESTAMP_LEN);
                $seed_info = $crawl_model->getCrawlSeedInfo(
                    $crawl_params[self::CRAWL_TIME], $machine_urls);
                $this->getCrawlParametersFromSeedInfo($crawl_params,
                    $seed_info);
                $crawl_params[self::TOR_PROXY] = TOR_PROXY;
                if(USE_PROXY) {
                    // PROXY_SERVERS is a |Z|-delimited string constant
                    $crawl_params[self::PROXY_SERVERS] =
                        explode("|Z|", PROXY_SERVERS);
                }
                /*
                    Write the new crawl parameters to the name server, so
                    that it can pass them along in the case of a new archive
                    crawl.
                */
                $filename = CRAWL_DIR.
                    "/schedules/name_server_messages.txt";
                file_put_contents($filename, serialize($crawl_params));
                chmod($filename, 0777);
                $crawl_model->sendStartCrawlMessage($crawl_params,
                    NULL, $machine_urls);
                $parent->redirectWithMessage(
                    tl('crawl_component_resume_crawl'), $request_fields);
            break;

            case "delete":
                if(isset($_REQUEST['timestamp'])) {
                    $timestamp = substr($parent->clean(
                        $_REQUEST['timestamp'], "int"), 0, TIMESTAMP_LEN);
                    $crawl_model->deleteCrawl($timestamp,
                        $machine_urls);
                    $parent->redirectWithMessage(
                        tl('crawl_component_delete_crawl_success'),
                        $request_fields);
                } else {
                    $parent->redirectWithMessage(
                        tl('crawl_component_delete_crawl_fail'),
                        $request_fields);
                }
            break;

            case "index":
                // Make the chosen crawl the default index used for search
                $timestamp = substr($parent->clean($_REQUEST['timestamp'],
                    "int"), 0, TIMESTAMP_LEN);
                $crawl_model->setCurrentIndexDatabaseName($timestamp);
                $parent->redirectWithMessage(
                    tl('crawl_component_set_index'),
                    $request_fields);
            break;

            case "options":
                $this->editCrawlOption($data, $machine_urls);
            break;
        }
    }
    return $data;
}
/**
* Called from @see manageCrawls to start a new crawl on the machines
* $machine_urls. Updates $data array with crawl start message
*
* @param array& $data an array of info to supply to AdminView
* @param array $machine_urls string urls of machines managed by this
* Yioop name server on which to perform the crawl
* @param array $seed_info allowed, disallowed, seed urls, etc to use in
* crawl
*/
function startCrawl(&$data, $machine_urls, $seed_info = NULL)
{
    $parent = $this->parent;
    $crawl_model = $parent->model("crawl");
    $crawl_params = array();
    $crawl_params[self::STATUS] = "NEW_CRAWL";
    $crawl_params[self::CRAWL_TIME] = time();
    /* Respect a caller-supplied $seed_info and only fall back to the
       stored crawl.ini settings when none was passed. (Previously the
       parameter was unconditionally overwritten, making it dead; all
       callers passing NULL behave exactly as before.) */
    if($seed_info === NULL) {
        $seed_info = $crawl_model->getSeedInfo();
    }
    $this->getCrawlParametersFromSeedInfo($crawl_params, $seed_info);
    // Crawl description shown in the admin crawl listing
    if(isset($_REQUEST['description'])) {
        $description = substr(
            $parent->clean($_REQUEST['description'], "string"), 0,
            TITLE_LEN);
    } else {
        $description = tl('crawl_component_no_description');
    }
    $crawl_params['DESCRIPTION'] = $description;
    $crawl_params[self::TOR_PROXY] = TOR_PROXY;
    if(USE_PROXY) {
        // PROXY_SERVERS is a |Z|-delimited string constant
        $crawl_params[self::PROXY_SERVERS] = explode("|Z|", PROXY_SERVERS);
    }
    /* Record the url prefixes of configured video sources so the crawler
       can recognize video pages */
    $crawl_params[self::VIDEO_SOURCES] = array();
    $sources =
        $parent->model("source")->getMediaSources('video');
    foreach($sources as $source) {
        $url = $source['SOURCE_URL'];
        // SOURCE_URL may contain a "{}" placeholder; keep only the prefix
        $url_parts = explode("{}", $url);
        $crawl_params[self::VIDEO_SOURCES][] = $url_parts[0];
    }
    // Let each active indexing plugin contribute its configuration data
    if(isset($crawl_params[self::INDEXING_PLUGINS]) &&
        is_array($crawl_params[self::INDEXING_PLUGINS])) {
        foreach($crawl_params[self::INDEXING_PLUGINS] as $plugin) {
            if($plugin == "") {continue;}
            $plugin_class = $plugin."Plugin";
            $plugin_obj = $parent->plugin(lcfirst($plugin));
            if(method_exists($plugin_class, "loadConfiguration")) {
                $crawl_params[self::INDEXING_PLUGINS_DATA][$plugin] =
                    $plugin_obj->loadConfiguration();
            }
        }
    }
    /*
        Write the new crawl parameters to the name server, so
        that it can pass them along in the case of a new archive
        crawl.
    */
    $filename = CRAWL_DIR.
        "/schedules/name_server_messages.txt";
    file_put_contents($filename, serialize($crawl_params));
    chmod($filename, 0777);
    $crawl_model->sendStartCrawlMessage($crawl_params,
        $seed_info, $machine_urls);
}
/**
* Reads the parameters for a crawl from an array gotten from a crawl.ini
* file
*
* @param array& $crawl_params parameters to write to queue_server
* @param array $seed_info data from crawl.ini file
*/
function getCrawlParametersFromSeedInfo(&$crawl_params, $seed_info)
{
    // Required fields: a crawl.ini always carries crawl_type/crawl_order
    $crawl_params[self::CRAWL_TYPE] = $seed_info['general']['crawl_type'];
    $crawl_params[self::CRAWL_INDEX] =
        (isset($seed_info['general']['crawl_index'])) ?
        $seed_info['general']['crawl_index'] : '';
    // Archive-crawl settings default to empty for plain web crawls
    $crawl_params[self::ARC_DIR]=(isset($seed_info['general']['arc_dir'])) ?
        $seed_info['general']['arc_dir'] : '';
    $crawl_params[self::ARC_TYPE] =
        (isset($seed_info['general']['arc_type'])) ?
        $seed_info['general']['arc_type'] : '';
    // Numeric page settings fall back to the global configuration constants
    $crawl_params[self::CACHE_PAGES] =
        (isset($seed_info['general']['cache_pages'])) ?
        intval($seed_info['general']['cache_pages']) :
        true;
    $crawl_params[self::PAGE_RANGE_REQUEST] =
        (isset($seed_info['general']['page_range_request'])) ?
        intval($seed_info['general']['page_range_request']) :
        PAGE_RANGE_REQUEST;
    $crawl_params[self::MAX_DESCRIPTION_LEN] =
        (isset($seed_info['general']['max_description_len'])) ?
        intval($seed_info['general']['max_description_len']) :
        MAX_DESCRIPTION_LEN;
    $crawl_params[self::PAGE_RECRAWL_FREQUENCY] =
        (isset($seed_info['general']['page_recrawl_frequency'])) ?
        intval($seed_info['general']['page_recrawl_frequency']) :
        PAGE_RECRAWL_FREQUENCY;
    $crawl_params[self::TO_CRAWL] = $seed_info['seed_sites']['url'];
    $crawl_params[self::CRAWL_ORDER] = $seed_info['general']['crawl_order'];
    $crawl_params[self::RESTRICT_SITES_BY_URL] =
        $seed_info['general']['restrict_sites_by_url'];
    // Site allow/deny lists default to empty arrays when absent
    $crawl_params[self::ALLOWED_SITES] =
        isset($seed_info['allowed_sites']['url']) ?
        $seed_info['allowed_sites']['url'] : array();
    $crawl_params[self::DISALLOWED_SITES] =
        isset($seed_info['disallowed_sites']['url']) ?
        $seed_info['disallowed_sites']['url'] : array();
    // The remaining sections are only copied over when present
    if(isset($seed_info['indexed_file_types']['extensions'])) {
        $crawl_params[self::INDEXED_FILE_TYPES] =
            $seed_info['indexed_file_types']['extensions'];
    }
    if(isset($seed_info['general']['summarizer_option'])) {
        $crawl_params[self::SUMMARIZER_OPTION] =
            $seed_info['general']['summarizer_option'];
    }
    if(isset($seed_info['active_classifiers']['label'])) {
        // Note that 'label' is actually an array of active class labels.
        $crawl_params[self::ACTIVE_CLASSIFIERS] =
            $seed_info['active_classifiers']['label'];
    }
    if(isset($seed_info['active_rankers']['label'])) {
        // Note that 'label' is actually an array of active class labels.
        $crawl_params[self::ACTIVE_RANKERS] =
            $seed_info['active_rankers']['label'];
    }
    if(isset($seed_info['indexing_plugins']['plugins'])) {
        $crawl_params[self::INDEXING_PLUGINS] =
            $seed_info['indexing_plugins']['plugins'];
    }
    $crawl_params[self::PAGE_RULES] =
        isset($seed_info['page_rules']['rule']) ?
        $seed_info['page_rules']['rule'] : array();
}
/**
* Called from @see manageCrawls to edit the parameters for the next
* crawl (or current crawl) to be carried out by the machines
* $machine_urls. Updates $data array to be supplied to AdminView
*
* @param array& $data an array of info to supply to AdminView
* @param array $machine_urls string urls of machines managed by this
* Yioop name server on which to perform the crawl
*/
function editCrawlOption(&$data, $machine_urls)
{
    $parent = $this->parent;
    $crawl_model= $parent->model("crawl");
    $data["leftorright"] = (getLocaleDirection() == 'ltr') ?
        "right": "left";
    $data["ELEMENT"] = "crawloptions";
    $crawls = $crawl_model->getCrawlList(false, false,
        $machine_urls);
    $indexes = $crawl_model->getCrawlList(true, true, $machine_urls);
    // Identify the user whose crawl mixes should also appear as indexes
    if(isset($_SESSION['USER_ID'])) {
        $user = $_SESSION['USER_ID'];
    } else {
        $user = $_SERVER['REMOTE_ADDR'];
    }
    // Crawl mixes count as indexes an archive crawl can draw from
    $mixes = $crawl_model->getMixList($user, false);
    foreach($mixes as $mix) {
        $tmp = array();
        $tmp["DESCRIPTION"] = "MIX::".$mix["NAME"];
        $tmp["CRAWL_TIME"] = $mix["TIMESTAMP"];
        $tmp["ARC_DIR"] = "MIX";
        $tmp["ARC_TYPE"] = "MixArchiveBundle";
        $indexes[] = $tmp;
    }
    $add_message = "";
    $indexes_by_crawl_time = array();
    $update_flag = false;
    $data['available_options'] = array(
        tl('crawl_component_use_below'),
        tl('crawl_component_use_defaults'));
    $data['available_crawl_indexes'] = array();
    $data['INJECT_SITES'] = "";
    $data['options_default'] = tl('crawl_component_use_below');
    // Previous crawls with a description become selectable option sets
    foreach($crawls as $crawl) {
        if(strlen($crawl['DESCRIPTION']) > 0 ) {
            $data['available_options'][$crawl['CRAWL_TIME']] =
                tl('crawl_component_previous_crawl')." ".
                $crawl['DESCRIPTION'];
        }
    }
    // Index crawls keyed by timestamp (reference kept for ARC_* lookup)
    foreach($indexes as $i => $crawl) {
        $data['available_crawl_indexes'][$crawl['CRAWL_TIME']]
            = $crawl['DESCRIPTION'];
        $indexes_by_crawl_time[$crawl['CRAWL_TIME']] =& $indexes[$i];
    }
    $no_further_changes = false;
    $seed_current = $crawl_model->getSeedInfo();
    /* Choose the seed info to edit: factory defaults (load_option == 1),
       a previous crawl's options (load_option > 1 is a timestamp), a
       currently running crawl ('ts'), or the stored crawl.ini. The first
       two branches keep the current page_* limits and stop processing
       any further form fields. */
    if(isset($_REQUEST['load_option']) &&
        $_REQUEST['load_option'] == 1) {
        $seed_info = $crawl_model->getSeedInfo(true);
        if(isset(
            $seed_current['general']['page_range_request'])) {
            $seed_info['general']['page_range_request'] =
                $seed_current['general']['page_range_request'];
        }
        if(isset(
            $seed_current['general']['page_recrawl_frequency'])
        ) {
            $seed_info['general']['page_recrawl_frequency'] =
                $seed_current['general']['page_recrawl_frequency'];
        }
        if(isset(
            $seed_current['general']['max_description_len'])) {
            $seed_info['general']['max_description_len'] =
                $seed_current['general']['max_description_len'];
        }
        $update_flag = true;
        $no_further_changes = true;
    } else if (isset($_REQUEST['load_option']) &&
        $_REQUEST['load_option'] > 1 ) {
        $timestamp =
            $parent->clean($_REQUEST['load_option'], "int");
        $seed_info = $crawl_model->getCrawlSeedInfo(
            $timestamp, $machine_urls);
        if(isset(
            $seed_current['general']['page_range_request'])) {
            $seed_info['general']['page_range_request'] =
                $seed_current['general']['page_range_request'];
        }
        if(isset(
            $seed_current['general']['page_recrawl_frequency'])
        ) {
            $seed_info['general']['page_recrawl_frequency'] =
                $seed_current['general']['page_recrawl_frequency'];
        }
        if(isset(
            $seed_current['general']['max_description_len'])) {
            $seed_info['general']['max_description_len'] =
                $seed_current['general']['max_description_len'];
        }
        $update_flag = true;
        $no_further_changes = true;
    } else if(isset($_REQUEST['ts'])) {
        // Editing the options of an in-progress crawl
        $timestamp = substr($parent->clean($_REQUEST['ts'], "int"), 0,
            TIMESTAMP_LEN);
        $seed_info = $crawl_model->getCrawlSeedInfo(
            $timestamp, $machine_urls);
        $data['ts'] = $timestamp;
    } else {
        $seed_info = $crawl_model->getSeedInfo();
    }
    /* Merge in user-suggested urls: appended to the seed sites for the
       next crawl, or collected into INJECT_SITES for a running crawl */
    if(isset($_REQUEST['suggest']) && $_REQUEST['suggest'] == 'add') {
        $suggest_urls = $crawl_model->getSuggestSites();
        if(isset($_REQUEST['ts'])) {
            $new_urls = array();
        } else {
            // Dated comment block marks where suggestions were added
            $seed_info['seed_sites']['url'][] = "#\n#".
                tl('crawl_component_added_urls', date('r'))."\n#";
            $crawl_model->clearSuggestSites();
        }
        foreach($suggest_urls as $suggest_url) {
            $suggest_url = trim($suggest_url);
            // Skip blanks and urls already present in the seed list
            if(!in_array($suggest_url, $seed_info['seed_sites']['url'])
                && strlen($suggest_url) > 0) {
                if(isset($_REQUEST['ts'])) {
                    $new_urls[] = $suggest_url;
                } else {
                    $seed_info['seed_sites']['url'][] = $suggest_url;
                }
            }
        }
        $add_message= tl('crawl_component_add_suggest');
        if(isset($_REQUEST['ts'])) {
            $data["INJECT_SITES"] = $parent->convertArrayLines($new_urls);
            if($data["INJECT_SITES"] == "") {
                $add_message= tl('crawl_component_no_new_suggests');
            }
        }
        $update_flag = true;
        $no_further_changes = true;
    }
    $page_options_properties = array('indexed_file_types',
        'active_classifiers', 'page_rules', 'indexing_plugins');
    //these properties should be changed under page_options not here
    foreach($page_options_properties as $property) {
        if(isset($seed_current[$property])) {
            $seed_info[$property] = $seed_current[$property];
        }
    }
    /* From here down, individual form fields are applied only when no
       terminal branch above has already set $no_further_changes */
    if(!$no_further_changes && isset($_REQUEST['crawl_indexes'])
        && in_array($_REQUEST['crawl_indexes'],
        array_keys($data['available_crawl_indexes']))) {
        $seed_info['general']['crawl_index'] = $_REQUEST['crawl_indexes'];
        $index_data = $indexes_by_crawl_time[$_REQUEST['crawl_indexes']];
        if(isset($index_data['ARC_DIR'])) {
            $seed_info['general']['arc_dir'] = $index_data['ARC_DIR'];
            $seed_info['general']['arc_type'] = $index_data['ARC_TYPE'];
        } else {
            $seed_info['general']['arc_dir'] = '';
            $seed_info['general']['arc_type'] = '';
        }
        $update_flag = true;
    }
    $data['crawl_index'] = (isset($seed_info['general']['crawl_index'])) ?
        $seed_info['general']['crawl_index'] : '';
    $data['available_crawl_types'] = array(self::WEB_CRAWL,
        self::ARCHIVE_CRAWL);
    if(!$no_further_changes && isset($_REQUEST['crawl_type']) &&
        in_array($_REQUEST['crawl_type'], $data['available_crawl_types'])) {
        $seed_info['general']['crawl_type'] = $_REQUEST['crawl_type'];
        $update_flag = true;
    }
    $data['crawl_type'] = $seed_info['general']['crawl_type'];
    // Which tab (web vs archive crawl) shows as active in the view
    if($data['crawl_type'] == self::WEB_CRAWL) {
        $data['web_crawl_active'] = "active";
        $data['archive_crawl_active'] = "";
    } else {
        $data['archive_crawl_active'] = "active";
        $data['web_crawl_active'] = "";
    }
    $data['available_crawl_orders'] = array(
        self::BREADTH_FIRST =>
            tl('crawl_component_breadth_first'),
        self::PAGE_IMPORTANCE =>
            tl('crawl_component_page_importance'));
    if(!$no_further_changes && isset($_REQUEST['crawl_order']) &&
        in_array($_REQUEST['crawl_order'],
        array_keys($data['available_crawl_orders']))) {
        $seed_info['general']['crawl_order'] = $_REQUEST['crawl_order'];
        $update_flag = true;
    }
    $data['crawl_order'] = $seed_info['general']['crawl_order'];
    // The checkbox is only meaningful when the form was actually posted
    if(!$no_further_changes && isset($_REQUEST['posted'])) {
        $seed_info['general']['restrict_sites_by_url'] =
            (isset($_REQUEST['restrict_sites_by_url'])) ?
            true : false;
        $update_flag = true;
    }
    $data['restrict_sites_by_url'] =
        $seed_info['general']['restrict_sites_by_url'];
    // Textareas for allowed/disallowed/seed site url lists
    $site_types =
        array('allowed_sites' => 'url', 'disallowed_sites' => 'url',
            'seed_sites' => 'url');
    foreach($site_types as $type => $field) {
        if(!$no_further_changes && isset($_REQUEST[$type])) {
            $seed_info[$type][$field] =
                $parent->convertStringCleanArray(
                $_REQUEST[$type], $field);
            $update_flag = true;
        }
        if(isset($seed_info[$type][$field])) {
            $data[$type] = $parent->convertArrayLines(
                $seed_info[$type][$field]);
        } else {
            $data[$type] = "";
        }
    }
    $data['TOGGLE_STATE'] =
        ($data['restrict_sites_by_url']) ?
        "checked='checked'" : "";
    $data['SCRIPT'] = "setDisplay('toggle', ".
        "'{$data['restrict_sites_by_url']}');";
    // Auto-submit the form when a load-option is picked (not for 'ts' edits)
    if(!isset($_REQUEST['ts'])) {
        $data['SCRIPT'] .=
            " elt('load-options').onchange = ".
            "function() { if(elt('load-options').selectedIndex !=".
            " 0) { elt('crawloptionsForm').submit(); }};";
    }
    if($data['crawl_type'] == CrawlConstants::WEB_CRAWL) {
        $data['SCRIPT'] .=
            "switchTab('webcrawltab', 'archivetab');";
    } else {
        $data['SCRIPT'] .=
            "switchTab('archivetab', 'webcrawltab');";
    }
    /* Urls typed into the inject box for a running crawl.
       NOTE(review): 'ts' is cleaned as "string" here but as "int" in the
       branch above -- looks inconsistent; confirm intended. */
    $inject_urls = array();
    if(isset($_REQUEST['ts']) &&
        isset($_REQUEST['inject_sites']) && $_REQUEST['inject_sites']) {
        $timestamp = substr($parent->clean($_REQUEST['ts'],
            "string"), 0, TIMESTAMP_LEN);
        $inject_urls =
            $parent->convertStringCleanArray(
            $_REQUEST['inject_sites']);
    }
    if($update_flag) {
        if(isset($_REQUEST['ts'])) {
            // Persist changes (plus any injected urls) to the live crawl
            if($inject_urls != array()) {
                $seed_info['seed_sites']['url'][] = "#\n#".
                    tl('crawl_component_added_urls', date('r'))."\n#";
                $seed_info['seed_sites']['url'] = array_merge(
                    $seed_info['seed_sites']['url'], $inject_urls);
            }
            $crawl_model->setCrawlSeedInfo($timestamp,
                $seed_info, $machine_urls);
            if($inject_urls != array() &&
                $crawl_model->injectUrlsCurrentCrawl(
                $timestamp, $inject_urls, $machine_urls)) {
                $add_message = "
".
                    tl('crawl_component_urls_injected');
                if(isset($_REQUEST['use_suggest']) &&
                    $_REQUEST['use_suggest']) {
                    $crawl_model->clearSuggestSites();
                }
            }
        } else {
            // Persist changes as the crawl.ini for the next crawl
            $crawl_model->setSeedInfo($seed_info);
        }
        $parent->redirectWithMessage(
            tl('crawl_component_update_seed_info'). " $add_message",
            array("arg"));
    }
    return $data;
}
/**
* Handles admin requests for creating, editing, and deleting classifiers.
*
* This activity implements the logic for the page that lists existing
* classifiers, including the actions that can be performed on them.
*/
function manageClassifiers()
{
$parent = $this->parent;
$crawl_model = $parent->model("crawl");
$possible_arguments = array('createclassifier', 'editclassifier',
'finalizeclassifier', 'deleteclassifier', 'search');
$data['ELEMENT'] = 'manageclassifiers';
$data['SCRIPT'] = '';
$data['FORM_TYPE'] = '';
$search_array = array();
$request_fields = array('start_row', 'num_show', 'end_row');
$machine_urls = $parent->model("machine")->getQueueServerUrls();
$num_machines = count($machine_urls);
if ($num_machines < 1 || ($num_machines == 1 &&
UrlParser::isLocalhostUrl($machine_urls[0]))) {
$machine_urls = NULL;
}
$data['leftorright'] =
(getLocaleDirection() == 'ltr') ? 'right': 'left';
$classifiers = Classifier::getClassifierList();
$start_finalizing = false;
if(isset($_REQUEST['arg']) &&
in_array($_REQUEST['arg'], $possible_arguments)) {
if(isset($_REQUEST['name'])) {
$name = substr($parent->clean($_REQUEST['name'], 'string'), 0,
NAME_LEN);
$name = Classifier::cleanLabel($name);
} else if(isset($_REQUEST['class_label'])) {
$name = substr($parent->clean(
$_REQUEST['class_label'], 'string'), 0,
NAME_LEN);
$name = Classifier::cleanLabel($name);
} else {
$name = "";
}
switch ($_REQUEST['arg'])
{
case 'createclassifier':
if (!isset($classifiers[$name])) {
$classifier = new Classifier($name);
Classifier::setClassifier($classifier);
$classifiers[$name] = $classifier;
$parent->redirectWithMessage(
tl('crawl_component_new_classifier'),
$request_fields);
} else {
$parent->redirectWithMessage(
tl('crawl_component_classifier_exists'),
$request_fields);
}
break;
case 'deleteclassifier':
/*
In addition to deleting the classifier, we also want to
delete the associated crawl mix (if one exists) used to
iterate over existing indexes in search of new training
examples.
*/
if(isset($classifiers[$name])) {
unset($classifiers[$name]);
Classifier::deleteClassifier($name);
$mix_name = Classifier::getCrawlMixName($name);
$mix_time = $crawl_model->getCrawlMixTimestamp(
$mix_name);
if ($mix_time) {
$crawl_model->deleteCrawlMixIteratorState(
$mix_time);
$crawl_model->deleteCrawlMix($mix_time);
}
$parent->redirectWithMessage(
tl('crawl_component_classifier_deleted'),
$request_fields);
} else {
$parent->redirectWithMessage(
tl('crawl_component_no_classifier'),
$request_fields);
}
break;
case 'editclassifier':
if(isset($classifiers[$name])) {
$data['class_label'] = $name;
$this->editClassifier($data, $classifiers,
$machine_urls);
} else {
$parent->redirectWithMessage(
tl('crawl_component_no_classifier'),
$request_fields);
}
break;
case 'finalizeclassifier':
/*
Finalizing is too expensive to be done directly in the
controller that responds to the web request. Instead, a
daemon is launched to finalize the classifier
asynchronously and save it back to disk when it's done.
In the meantime, a flag is set to indicate the current
finalizing state.
*/
CrawlDaemon::start("classifier_trainer", $name, '', -1);
$classifier = $classifiers[$name];
$classifier->finalized = Classifier::FINALIZING;
$start_finalizing = true;
$data['SCRIPT'] .= "doMessage('