Documentation is available at lucene-defs.php
- <?php
- /* ******************************************************************** */
- /* CATALYST PHP Source Code */
- /* -------------------------------------------------------------------- */
- /* This program is free software; you can redistribute it and/or modify */
- /* it under the terms of the GNU General Public License as published by */
- /* the Free Software Foundation; either version 2 of the License, or */
- /* (at your option) any later version. */
- /* */
- /* This program is distributed in the hope that it will be useful, */
- /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
- /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
- /* GNU General Public License for more details. */
- /* */
- /* You should have received a copy of the GNU General Public License */
- /* along with this program; if not, write to: */
- /* The Free Software Foundation, Inc., 59 Temple Place, Suite 330, */
- /* Boston, MA 02111-1307 USA */
- /* -------------------------------------------------------------------- */
- /* */
- /* Filename: lucene-defs.php */
- /* Author: Paul Waite */
- /* Description: NB: This module is a variant of the original lucene */
- /* module which processed fields line-by-line. This module */
- /* implements the XML interface to Lucene. */
- /* */
- /* Definitions for interfacing to the LUCENE search */
- /* engine system. LUCENE is a system which is optimised */
- /* for indexing and searching in a generic way. It is */
- /* implemented as a server accessible via a port over TCP. */
- /* This module understands the protocol that this server */
- /* uses to implement indexing and search queries. */
- /* */
- /* ******************************************************************** */
/** @package search */
include_once("search-defs.php");
/** Stopwatch microtimer */
include_once("timer-defs.php");
/** XML classes */
include_once("xml-defs.php");
// ----------------------------------------------------------------------
// Socket behaviour. These are compared against / assigned to the
// $timeoutsecs settings of the lucene_connection class.
/** Do not wait on socket receive, return immediately */
define("SOCK_NO_WAIT", 0);
/** Wait on socket forever (well, 24hrs is that, more or less) */
define("SOCK_FOREVER", 86400);
/** Times to retry timed-out socket sends/receives */
define("SOCK_RETRIES", 3);
// Field definition flags, used as the $stored and $indexed arguments
// of the define_field() methods.
/** Used to indicate that a field should be indexed by Lucene */
define("INDEXED", true);
/** Used to indicate that a field should NOT be indexed by Lucene */
define("NOT_INDEXED", false);
/** Used to indicate that a field should be stored by Lucene */
define("STORED", true);
/** Used to indicate that a field should NOT be stored by Lucene */
define("NOT_STORED", false);
/** The name of the field Lucene should assume if none specified */
define("DEFAULT_FIELD", "Text");
/** Default type of field: 'Text', 'Date', 'Id' */
define("DEFAULT_FIELDTYPE", "Text");
// Index ID generation modes.
/** Mode of index ID generation is by incrementing integer */
define("ID_FROM_INC", 0);
/** Mode of index ID generation is by filename stripped of path and extension */
define("ID_FROM_NAME", 1);
/** Mode of index ID generation is by full filename (incl. extension) */
define("ID_FROM_FILENAME", 2);
/** Mode of index ID generation is by full path to file */
define("ID_FROM_PATH", 3);
/** Indicates index fields come from meta tag extraction */
define("META_TAG_FIELDS", true);
- // ----------------------------------------------------------------------
/**
 * The lucene connection class
 * This class inherits the functionality of the 'search' class since mostly
 * that is what we will be connecting to Lucene for. The Indexing and
 * Control descendants can just ignore this inherited basic searching
 * functionality.
 * This class knows how to open a TCP socket to a Lucene server, write the
 * content of $message to it and collect the raw reply in $responsebuf.
 * Child classes which need to talk to this server to do indexing or
 * querying should inherit this class.
 * NOTE(review): this class relies on a PHP4-style constructor (a method
 * named after the class). PHP 8 no longer treats such methods as
 * constructors, so a __construct() wrapper will be needed on that runtime.
 * @package search
 */
class lucene_connection extends search {
  // Public
  /** HOST running the Lucene query server */
  var $host = "";
  /** PORT that the server is listening on */
  var $port = "";
  /** Timeout for socket sends/receives, in seconds */
  var $timeoutsecs = 10;
  // Private
  /** Whether Lucene is enabled..
      @access private */
  var $enabled = true;
  /** The message waiting to be sent
      @access private */
  var $message = "";
  /** Raw response content we receive back from the Lucene server
      @access private */
  var $responsebuf = "";
  /** Socket file pointer, or false when no socket is open
      @access private */
  var $sockfp = false;
  /** True if we are connected to socket
      @access private */
  var $connected = false;
  /** An execution timer
      @access private */
  var $timer;
  // .....................................................................
  /** Constructor - Lucene connection
   * Connects straight away if a host is given, otherwise the connection
   * is left until the first send().
   * @param string  $host        Hostname or IP of Lucene server
   * @param string  $port        Port of Lucene server
   * @param integer $timeoutsecs Seconds to timeout the connection
   */
  function lucene_connection($host="", $port="", $timeoutsecs="") {
    debugbr("Lucene connection: using XML interface v1.0");
    if ($host != "") {
      $this->connect($host, $port, $timeoutsecs);
    }
    $this->timer = new microtimer();
  } // lucene_connection
  // .....................................................................
  /**
   * Tell whether a caller actually supplied a timeout value. The "not
   * given" markers are nullstring, null and false. Zero (SOCK_NO_WAIT) is
   * a real value and must NOT be mistaken for "not given", which is what
   * a loose comparison against "" does on PHP versions before 8.
   * @access private
   * @param mixed $timeoutsecs The timeout argument to examine
   * @return boolean True if a timeout value was supplied
   */
  function timeout_given($timeoutsecs) {
    return ($timeoutsecs !== "" && $timeoutsecs !== null && $timeoutsecs !== false);
  } // timeout_given
  // .....................................................................
  /**
   * Connect to the Lucene server. Optionally over-ride various settings
   * which were set in the constructor. Normally this method is only
   * called internally, in response to a request to send a message to
   * the Lucene server.
   * If the Axyl 'configuration' class is available, the "sys_control"
   * configuration supplies the enabled flag, and also the host and port
   * when no host has been set yet.
   * @access private
   * @param string  $host        Hostname or IP of Lucene server
   * @param string  $port        Port of Lucene server
   * @param integer $timeoutsecs Seconds to timeout the connection
   * @return boolean True if the socket was opened
   */
  function connect($host="", $port="", $timeoutsecs="") {
    // Override host and port if given..
    if ($host != "") $this->host = $host;
    if ($port != "") $this->port = $port;
    // Utilise the Axyl configuration settings, if available..
    if (class_exists("configuration")) {
      $config = new configuration("sys_control");
      // This controls whether we have Lucene capability or not..
      if ($config->field_exists("Lucene Site Indexing")) {
        $this->enabled = $config->value("Lucene Site Indexing");
      }
      // Only set host & port if they have not been given yet..
      if ($this->host == "") {
        $this->host = $config->value("Lucene Host");
        $this->port = $config->value("Lucene Port");
        debugbr("acquired Axyl config: host=$this->host, port=$this->port");
      }
    }
    // Try to open socket if we have a host..
    $this->connected = false;
    if ($this->enabled && $this->host != "") {
      $this->sockfp = fsockopen($this->host, $this->port);
      if (!$this->sockfp) {
        $this->log_error("failed to connect to '$this->host:$this->port'");
      }
      else {
        // Flag the connection BEFORE applying the timeout: set_timeout()
        // does nothing while we are not marked as connected..
        $this->connected = true;
        if ($this->timeout_given($timeoutsecs)) $this->timeoutsecs = $timeoutsecs;
        $this->set_timeout($this->timeoutsecs);
        debugbr("lucene_connection: connected to '$this->host:$this->port'");
      }
    }
    // Return result..
    return $this->connected;
  } // connect
  // .....................................................................
  /**
   * Disconnect from the Lucene server. Normally this is used only by
   * internal Lucene server methods. The connected flag is cleared so that
   * the next send() opens a fresh socket rather than writing to a closed
   * one.
   * @access private
   */
  function disconnect() {
    if ($this->connected) {
      if ($this->sockfp) {
        fclose($this->sockfp);
      }
      $this->sockfp = false;
      $this->connected = false;
    }
  } // disconnect
  // .....................................................................
  /**
   * Set the socket timeout. Deals with the special case of setting
   * the socket to non-blocking mode (zero timeout, ie. SOCK_NO_WAIT).
   * Does nothing when there is no open connection.
   * @param integer $timeoutsecs Set the timeout in seconds
   */
  function set_timeout($timeoutsecs) {
    if ($this->connected && $this->timeout_given($timeoutsecs)) {
      $this->timeoutsecs = $timeoutsecs;
      $nowait = ($this->timeoutsecs == SOCK_NO_WAIT);
      if (!$nowait) {
        stream_set_timeout($this->sockfp, (int) $this->timeoutsecs);
      }
      // Blocking unless we were asked not to wait at all..
      stream_set_blocking($this->sockfp, !$nowait);
    }
  } // set_timeout
  // .....................................................................
  /**
   * Sends a message to the Lucene server, and receives the response. We
   * operate on the understanding that every time we send something to
   * Lucene we expect a response. Since this method already calls the
   * receive() method, there is no need to call it from your application.
   * The content to be sent is expected to be already in the class
   * string variable $message. The raw reply is left in $responsebuf;
   * $response is reset to an empty array here.
   * A failed receive is retried up to SOCK_RETRIES times.
   * @param integer $timeoutsecs Override for timeout in seconds
   * @return boolean True if the message was sent and a reply received
   */
  function send($timeoutsecs="") {
    $send_ok = true;
    $this->response = array();
    if (!$this->connected) {
      $this->connect();
    }
    if ($this->connected) {
      // Check for timeout over-ride..
      if ($this->timeout_given($timeoutsecs)) $this->timeoutsecs = $timeoutsecs;
      $this->set_timeout($this->timeoutsecs);
      // Send message..
      if ($this->message != "") {
        $this->timer->restart();
        $bytesput = fputs($this->sockfp, $this->message);
        $this->timer->stop();
        if (debugging()) {
          $buf = trim(substr(rawurldecode($this->message),0, 5000));
          debugbr("<pre>" . xmldump($buf) . "</pre>", DBG_DUMP);
          debugbr("lucene_connection: send transaction took " . $this->timer->formatted_millisecs() . "mS");
        }
        // fputs() reports failure as false (never -1). Nothing written
        // for a non-empty message is just as much a failure..
        if ($bytesput === false || $bytesput === 0) {
          $this->log_error("write to server failed");
          $send_ok = false;
        }
        else {
          debugbr("lucene_connection: send ok ($bytesput bytes)");
          for ($i=0; $i < SOCK_RETRIES; $i++) {
            $send_ok = $this->receive();
            if ($send_ok) break;
            debugbr("lucene_connection: receive retry #" . ($i + 1));
          }
        }
      }
      else {
        $this->log_error("trying to send null content");
        $send_ok = false;
      }
    }
    else {
      $this->log_error("send with no open socket");
      $send_ok = false;
    }
    // Return status..
    return $send_ok;
  } // send
  // .....................................................................
  /**
   * Receive a message from the Lucene server. Reads from the socket into
   * $responsebuf until the server closes the connection. The read gives
   * up with a failure if the socket timeout expires, and when the socket
   * is non-blocking (SOCK_NO_WAIT) it returns as soon as no more data is
   * immediately available, with or without a message. This is a low-level
   * routine which deals with receiving the message over TCP sockets.
   * @return boolean True if the message was received loud and clear
   * @access private
   */
  function receive() {
    $received_ok = true;
    if ($this->connected) {
      $this->timer->restart();
      $this->responsebuf = "";
      while (!feof($this->sockfp)) {
        $buf = fread($this->sockfp, 10000);
        if ($buf === false) {
          $this->log_error("no response from server");
          $received_ok = false;
          break;
        }
        if ($buf === "") {
          // An empty read is how a timeout shows up, the stream itself
          // has to be asked what happened..
          $meta = stream_get_meta_data($this->sockfp);
          if (!empty($meta["timed_out"])) {
            $this->log_error("timed out waiting for server response");
            $received_ok = false;
            break;
          }
          if (empty($meta["blocked"])) {
            // Non-blocking socket with nothing (more) to read right now..
            break;
          }
          continue;
        }
        $this->responsebuf .= $buf;
      }
      $this->timer->stop();
      if (debugging()) {
        debugbr("<pre>" . xmldump($this->responsebuf) . "</pre>", DBG_DUMP);
        debugbr("lucene_connection: response from server took " . $this->timer->formatted_millisecs() . "mS");
      }
    }
    else {
      $this->log_error("receive with no open socket");
      $received_ok = false;
    }
    // Return status..
    return $received_ok;
  } // receive
  // .....................................................................
  /** Log a message to the PHP error log and print info to debugger. The
   * message is tagged with the class name, host and port, and the logged
   * copy is prefixed with APP_NAME when that constant is defined.
   * @param string $err The error message text
   * @access private
   */
  function log_error($err) {
    $prefix = (defined("APP_NAME") ? APP_NAME . ": " : "");
    $err = "Lucene error: " . get_class($this) . ": $this->host:$this->port: $err";
    debugbr($err);
    error_log($prefix . $err, 0);
  } // log_error
} // lucene_connection class
- // ----------------------------------------------------------------------
/**
 * The lucene fieldset class. A container for the Lucene fields of one
 * lucene message. Each field is an XML <Field> tag object, and together
 * they make up the field list of a query message or an index message.
 * @access private
 * @package search
 */
class lucene_fieldset {
  /** The field tag objects, keyed by field name */
  var $xmltags = array();
  // .....................................................................
  /** Constructor - there is nothing to initialise */
  function lucene_fieldset() { }
  // .....................................................................
  /**
   * Fetch the field of the given name from the fieldset.
   * NOTES: If no such field is held yet, a new <Field> tag carrying the
   * name attribute is made and returned instead. That new field is not
   * held by the fieldset until it is handed to put_field(). A field
   * object is always returned.
   * @param string $fieldname The name of the field to get
   * @return object An xmltag object for the field
   */
  function get_field($fieldname) {
    if (!isset($this->xmltags[$fieldname])) {
      $newtag = new xmltag("Field");
      $newtag->setattribute("name", $fieldname);
      return $newtag;
    }
    return $this->xmltags[$fieldname];
  } // get_field
  // .....................................................................
  /**
   * Hold the given field in the fieldset under the given name, replacing
   * any field already held under that name.
   * @param string $fieldname Unique name of the field in the set
   * @param object $field The field object to store
   */
  function put_field($fieldname, $field) {
    $this->xmltags[$fieldname] = $field;
  } // put_field
  // .....................................................................
  /** Define a field in the fieldset. Sets the type, stored and indexed
   * attributes of the named field, creating the field first if it is
   * not yet in the fieldset.
   * @param string  $fieldname Name of the field
   * @param string  $type      Type of this field eg. "Date"
   * @param boolean $stored    Whether field value should be stored by Lucene
   * @param boolean $indexed   Whether field value should be indexed by Lucene
   */
  function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
    // The flags travel as the literal words true/false..
    $storedflag = "false";
    if ($stored) {
      $storedflag = "true";
    }
    $indexedflag = "false";
    if ($indexed) {
      $indexedflag = "true";
    }
    $tag = $this->get_field($fieldname);
    $tag->setattribute("type", $type);
    $tag->setattribute("stored", $storedflag);
    $tag->setattribute("indexed", $indexedflag);
    $this->put_field($fieldname, $tag);
  } // define_field
  // .....................................................................
  /** Add a field to the fieldset, or replace the value of one which is
   * already there.
   * @param string $fieldname  Name of the field
   * @param string $fieldvalue Value to associate with this field
   */
  function add_field($fieldname, $fieldvalue="") {
    $tag = $this->get_field($fieldname);
    $tag->value = $fieldvalue;
    $this->put_field($fieldname, $tag);
  } // add_field
  // .....................................................................
  /** Clear all fields from the fieldset */
  function clear() {
    $this->xmltags = array();
  } // clear
  // .....................................................................
  /**
   * Render every field in the fieldset, in the order the fields were
   * first put into it.
   * @return string The rendered fields, concatenated
   */
  function render() {
    $pieces = array();
    foreach ($this->xmltags as $tag) {
      $pieces[] = $tag->render();
    }
    return implode("", $pieces);
  } // render
} // lucene_fieldset class
- // ----------------------------------------------------------------------
/**
 * The lucene msg class. This is a raw class which holds the basic
 * message tags and fields, and knows how to build them into a full
 * XML message for sending to the lucene server.
 * NOTE(review): this class relies on a PHP4-style constructor (a method
 * named after the class). PHP 8 no longer treats such methods as
 * constructors, so a __construct() wrapper will be needed on that runtime.
 * @package search
 */
class lucene_msg extends lucene_connection {
  // Public
  /** Type/name of this message. Used as the name of the root XML tag */
  var $type = "";
  // Private
  /** Array containing XML tags
      @access private */
  var $xmltags = array();
  /** Object containing lucene fields
      @access private */
  var $fieldset;
  /** True if message has been built
      @access private */
  var $built = false;
  /** Error message if any error occurred
      @access private */
  var $error_msg = "";
  // .....................................................................
  /** Constructor
   * Notes: The application is either specified in the formal parameters or it
   * can be determined for an Axyl application by using the APP_PREFIX which
   * is unique to the application. This is the recommended option. Other
   * developers have, however, also used the configvalue 'Lucene Application'
   * for some reason, so this is still supported here. If none of these
   * methods results in a valid identifier, 'default' is used.
   * @param string $type Type of message this is, eg; QUERY, INDEX..
   * @param string $application The application name. Sets default Lucene config.
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_msg($type="", $application="?", $host="", $port="") {
    $this->lucene_connection($host, $port);
    $this->type = $type;
    $this->fieldset = new lucene_fieldset();
    // We must have an application..
    if ($application == "?") {
      if (class_exists("configuration")) {
        $config = new configuration("sys_control");
        $application = $config->value("Lucene Application");
      }
      // Axyl configuration value may not be defined and the APP_PREFIX
      // will be used in this case. Failing that, we fall back to the
      // default case for standalone apps..
      if ($application == "" || $application == "?") {
        $application = (defined("APP_PREFIX") ? APP_PREFIX : "default");
      }
    }
    // Set the application..
    $this->set_application($application);
  } // lucene_msg
  // .....................................................................
  /**
   * Add a new XML tag object to this Lucene message. The tag is appended
   * to those already added, and the message is marked as needing a
   * rebuild.
   * @param object $tag The xmltag object to add to our lucene msg
   */
  function add_xmltag($tag) {
    $this->xmltags[] = $tag;
    $this->built = false;
  } // add_xmltag
  // .....................................................................
  /**
   * Specify the application. The application is the name of a configuration
   * set which has been specified either by a control message, or by using
   * configuration files on the server. A given configuration set identified
   * by an application name can have specific fields already defined, such
   * as Sort: or Domain: etc.
   * Notes: The 'Application' header can only appear once in the message.
   * The constructor already calls this method, and every call appends
   * another Application tag, so calling it again yields a second one.
   * @param string $application The application name to set.
   */
  function set_application($application) {
    $this->add_xmltag( new xmltag("Application", $application) );
  } // set_application
  // .....................................................................
  /**
   * Specify a domain. A domain is an identifier which groups indexed
   * objects internally to Lucene. This allows searches on multiple
   * archives of documents in a single Lucene installation.
   * Notes: There may be zero or more domain headers in the message. If it
   * does not appear, then any domain header defined for the application
   * will be applied on its own. Otherwise any definitions added by this
   * method are OR'd with any specified in the application config.
   * NB: If no domains are specified anywhere, any searching will be done
   * across all domains (which would probably yield very confusing return
   * data!).
   * @param string $domain The domain to set.
   */
  function set_domain($domain) {
    $this->add_xmltag( new xmltag("Domain", $domain) );
  } // set_domain
  // .....................................................................
  /** Add a field to the fieldset, and mark the message for a rebuild.
   * @param string $fieldname Name of the field
   * @param string $fieldvalue Value to associate with this field
   */
  function add_field($fieldname, $fieldvalue="") {
    $this->fieldset->add_field($fieldname, $fieldvalue);
    $this->built = false;
  } // add_field
  // .....................................................................
  /** Clear the fields and the built message text, leaving the type
   * definition alone. Tags added via add_xmltag() are not removed. */
  function clear() {
    $this->fieldset->clear();
    $this->message = "";
    $this->built = false;
  } // clear
  // .....................................................................
  /**
   * Builds the message according to the message type. The result is an
   * XML document whose root tag is named by $type and which contains the
   * added XML tags, followed by a <Fields> tag holding the fieldset if
   * that is not empty. Nothing is built when no type has been set. This
   * method may be over-ridden in children inheriting this class.
   * @access private
   * @return boolean True if a built message is available in $message
   */
  function build() {
    if (!$this->built && $this->type != "") {
      $xml = new xmltag($this->type);
      // XML TAGS
      foreach ($this->xmltags as $tag) {
        $xml->childtag($tag);
      }
      // FIELDS
      if (count($this->fieldset->xmltags) > 0) {
        $fields = new xmltag("Fields");
        foreach ($this->fieldset->xmltags as $field) {
          $fields->childtag($field);
        }
        $xml->childtag($fields);
      }
      $this->message = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $xml->render();
      $this->built = true;
    }
    return $this->built;
  } // build
  // .....................................................................
  /**
   * Builds the current message if need be and sends it to Lucene. The
   * socket is closed again once the send-receive transaction is done.
   * @param integer $timeoutsecs Override for timeout in seconds
   * @return boolean True if the message was built, sent and answered
   */
  function send($timeoutsecs="") {
    $sent_ok = false;
    if ($this->build()) {
      // Low-level socket send-receive transaction..
      $sent_ok = parent::send($timeoutsecs);
      // Once a msg is sent, socket can be closed..
      $this->disconnect();
    }
    return $sent_ok;
  } // send
} // lucene_msg class
- // ----------------------------------------------------------------------
/**
 * The lucene message class. This class extends its parent class
 * lucene_msg and adds some higher level methods for adding groups of
 * tags and field definitions to the message.
 * NOTE(review): this class relies on a PHP4-style constructor (a method
 * named after the class). PHP 8 no longer treats such methods as
 * constructors, so a __construct() wrapper will be needed on that runtime.
 * @package search
 */
class lucene_message extends lucene_msg {
  /** Response object which will parse XML content
      @access private */
  var $response;
  // .....................................................................
  /** Constructor
   * This is a more complex class which builds on the basic lucene_msg
   * class to provide some higher level methods for adding fields in
   * specific ways to support CONTROL, QUERY and INDEX message types.
   * @param string $type Type of message this is, eg; QUERY, INDEX..
   * @param string $application The application name. Sets default Lucene config.
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_message($type="", $application="?", $host="", $port="") {
    $this->lucene_msg($type, $application, $host, $port);
  } // lucene_message
  // .....................................................................
  /**
   * Strip field type specifiers out of field strings. A field string with
   * a type specifier in it is of the form: 'Foo:Date', where the field
   * name is 'Foo' and the field type is 'Date'. Possible field types are
   * 'Id', 'Text' (the default), and 'Date'.
   * Note that sort field specification is a special case, where the syntax
   * can be 'Foo:Date:Desc' or 'Foo:Desc' indicating the sort on the given
   * field should be done in descending order.
   * At present you would only use this facility with a 'Date' field, and
   * everything else would then default to 'Text'. [The 'Id' type being a
   * special one]
   * We return the field stripped of any type, and if a type was present
   * we issue the define_field() directive to define it. A field so-defined
   * will always be both stored by Lucene and indexed.
   * @param string $field Field in 'Foo:Date' format, or just 'Foo' for default type
   * @return string The fieldname stripped of any type specifier, with
   *                ':Desc' appended if a descending sort was asked for
   * @access private
   */
  function strip_field_type($field) {
    $retfieldname = $field;
    if (strpos($field, ":") !== false) {
      // Extract field specifier parts. Only the name is certain to be
      // there, so absent specifiers are read as nullstring rather
      // than being trimmed as nulls..
      $bits = explode(":", $field);
      $fieldname = trim($bits[0]);
      $f1 = (isset($bits[1]) ? trim($bits[1]) : "");
      $f2 = (isset($bits[2]) ? trim($bits[2]) : "");
      $retfieldname = $fieldname;
      // Check for a sort field with DESC specifier..
      if ($f1 == "Desc" || $f2 == "Desc") {
        $retfieldname .= ":Desc";
      }
      // Check for valid field type specifier..
      if ($f1 == "Date" || $f1 == "Text" || $f1 == "Id") {
        // Define field by name..
        $this->define_field($fieldname, $f1);
      }
    }
    // Return fieldname plus any sort spec..
    return $retfieldname;
  } // strip_field_type
  // .....................................................................
  /**
   * Turn a list of field specifiers into the space-delimited list of
   * field names which goes into the message. Each specifier is passed
   * through strip_field_type(), so any field types get defined.
   * @param mixed $fields Comma-delimited fieldname list, or array of fields
   * @return string Space-delimited list of stripped field names
   * @access private
   */
  function stripped_fieldlist($fields) {
    $flds = (is_array($fields) ? $fields : explode(",", $fields));
    $names = array();
    foreach ($flds as $field) {
      $names[] = $this->strip_field_type($field);
    }
    return implode(" ", $names);
  } // stripped_fieldlist
  // .....................................................................
  /**
   * Define a field. We supply the name of the field, its type (Text, Date
   * or Id), and whether it should be stored by Lucene for later retrieval
   * in queries. For example you would not store the raw document/content as
   * this is usually stored elsewhere.
   * We also cater for fields which might not need to be indexed. These would
   * be fields of data you just want to return with the document, if found in
   * a query, but not search on. An example might be a field containing the
   * path to the physical document on disk. For these fields you would then
   * specify NOT_INDEXED for the $indexed parameter. These fields MUST be
   * stored, so we make the rule: if the field is NOT_INDEXED then it must
   * be STORED (this will be forced).
   * In the normal course of events, fields will be defined to be both stored
   * and indexed. The exception is the special "Text" field associated with
   * an item "Body", which is indexed, but never stored.
   * The definition is recorded in the fieldset of this message.
   * @param string $fieldname Name of the field to index
   * @param string $type Type of field data: Text, Date or Id.
   * @param boolean $stored If true then Lucene will store the content itself
   * @param boolean $indexed If true then Lucene will index the field content
   */
  function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
    // Force non-indexed fields to be stored..
    if ($indexed == NOT_INDEXED) $stored = STORED;
    $this->fieldset->define_field($fieldname, $type, $stored, $indexed);
  } // define_field
  // .....................................................................
  /**
   * Specify the fields you want returned from Lucene.
   * Fields should be in a comma-separated list of field names. Each field
   * name can have the field type included in the form 'Foo:Date', where
   * 'Date' is the type in this instance. In fact, since 'Text' is the
   * default field type, 'Date' is probably the only one you need to use
   * as the current implementation stands.
   * This method adds a 'Return' tag to the message via add_xmltag().
   * @see add_xmltag
   * @param mixed $fields Comma-delimited fieldname list, or array of fields
   */
  function set_returnfields($fields) {
    $returnlist = $this->stripped_fieldlist($fields);
    $this->add_xmltag( new xmltag("Return", $returnlist) );
  } // set_returnfields
  // .....................................................................
  /**
   * Specify query limit field. This sets the maximum number of results
   * that Lucene should return.
   * @param integer $limit Maximum number of results (hits) to return
   */
  function set_limit($limit) {
    $this->add_xmltag( new xmltag("Limit", $limit) );
  } // set_limit
  // .....................................................................
  /**
   * Specify query offset field 'First'. This sets the offset for the
   * returned results. For example, if this was set to 3, and Lucene
   * found 20 hits, then results would be sent back from the 3rd hit
   * onwards.
   * @param integer $first Offset in result set to start from
   */
  function set_first($first) {
    $this->add_xmltag( new xmltag("First", $first) );
  } // set_first
  // .....................................................................
  /**
   * Specify the fields you want query results to be ordered by.
   * Fields should be in a comma-separated list of field names. Each field
   * name can have the field type included in the form 'Foo:Date', where
   * 'Date' is the type in this instance. In fact, since 'Text' is the
   * default field type, 'Date' is probably the only one you need to use
   * as the current implementation stands.
   * Note that sort field specification is a special case, where the syntax
   * can be 'Foo:Date:Desc' or 'Foo:Desc' indicating the sort on the given
   * field should be done in descending order.
   * This method adds a 'Sort' tag to the message via add_xmltag().
   * @see add_xmltag
   * @param mixed $fields Comma-delimited fieldname list, or array of fields
   */
  function set_sortorder($fields) {
    $sortlist = $this->stripped_fieldlist($fields);
    $this->add_xmltag( new xmltag("Sort", $sortlist) );
  } // set_sortorder
  // .....................................................................
  /**
   * Specify a range on a field for querying. We specify the name of a field
   * which is used to select articles within the given limits, and
   * the limits themselves. Either limit may be passed as nullstring (or
   * false, or null) which indicates no limit on that side. A limit of
   * zero is a real limit, and is sent. Any dates must be passed as
   * standard Unix timestamps (seconds since 1970).
   * Notes: This method can be called multiple times to define additional
   * ranges for different field names. Nothing is added when no field
   * name is given.
   * This method adds a 'Range' tag to the message via add_xmltag().
   * @see add_xmltag
   * @param string $range_from Value of lowerbound range
   * @param string $range_to Value of upperbound range
   * @param string $range_fieldname Name of field to use in range query.
   */
  function set_range($range_from="", $range_to="", $range_fieldname="") {
    if ($range_fieldname != "") {
      $range = new xmltag("Range");
      $range->setattribute("field", $this->strip_field_type($range_fieldname));
      // Strict tests, so that a bound of 0 or "0" is not taken to
      // mean that there is no bound..
      if ($range_from !== "" && $range_from !== false && $range_from !== null) {
        $range->childtag( new xmltag("From", $range_from) );
      }
      if ($range_to !== "" && $range_to !== false && $range_to !== null) {
        $range->childtag( new xmltag("To", $range_to) );
      }
      $this->add_xmltag( $range );
    }
  } // set_range
  // .....................................................................
  /**
   * Supply a stopword list to lucene.
   * This method adds a 'Stop-List' tag to the message via add_xmltag().
   * @see add_xmltag
   * @param mixed $stopwords Space-delimited list, or array of stopwords
   */
  function set_stopwords($stopwords) {
    $mystops = (is_array($stopwords) ? implode(" ", $stopwords) : $stopwords);
    $this->add_xmltag( new xmltag("Stop-List", $mystops) );
  } // set_stopwords
} // lucene_message class
- // ----------------------------------------------------------------------
/**
 * One result (hit) of a generic search query: a rank plus the field
 * values which came back with it. This is for internal use only.
 * NOTE(review): relies on a PHP4-style constructor (a method named after
 * the class), which PHP 8 no longer treats as a constructor.
 * @package search
 * @access private
 */
class queryresult {
  /** Rank of this result, as given to the constructor */
  var $rank = "";
  /** Field values of this result, keyed by field name */
  var $fields = array();
  // .....................................................................
  /** Constructor
   * @param string $rank Rank of this result
   */
  function queryresult($rank="")
  {
    $this->rank = $rank;
  } // queryresult
  // .....................................................................
  /** Record a field value against this result. A value already recorded
   * under the same field name is replaced.
   * @param string $fieldname  Name of the field
   * @param string $fieldvalue Value of the field
   */
  function addfield($fieldname, $fieldvalue)
  {
    $this->fields[$fieldname] = $fieldvalue;
  } // addfield
} // queryresult class
- // ----------------------------------------------------------------------
/**
 * Class comprising the functionality of a Lucene response parser. It
 * picks the Error, Status and Serial elements out of the XML that comes
 * back from the server. This is for internal use only.
 * NOTE(review): this class relies on a PHP4-style constructor (a method
 * named after the class). PHP 8 no longer treats such methods as
 * constructors, so a __construct() wrapper will be needed on that runtime.
 * @package search
 * @access private
 */
class response_parser extends xmlparser {
  /** Current/last tag opened. Nullstring once a tag has been closed */
  var $tag = "";
  /** Attributes array for current/last tag */
  var $attr = array();
  /** Serial transaction ID */
  var $serial = "";
  /** Status message */
  var $status_message = "";
  /** Error message, being the text of the Error tag */
  var $error_message = "";
  /** True if response was valid, ie. no errors */
  var $valid = true;
  // .....................................................................
  /** Construct a new parser. */
  function response_parser() {
    $this->xmlparser();
  } // response_parser
  // .....................................................................
  /** Method invoked when a tag is opened. Notes the tag name, merges its
   * attributes into $attr, and empties the text collected for the tags
   * we keep, since cdata() appends to it. An Error tag also marks the
   * response as invalid.
   * NOTE(review): the attributes are merged into $attr rather than
   * replacing it, so those of a still-open enclosing tag stay visible
   * until some tag closes - confirm that is intended.
   */
  function tag_open($parser, $tag, $attributes) {
    $this->tag = $tag;
    if (is_array($attributes) && count($attributes) > 0) {
      foreach ($attributes as $key => $value ) {
        $this->attr[$key] = $value;
      }
    }
    switch ($tag) {
      case "Error":
        $this->valid = false;
        $this->error_message = "";
        break;
      case "Status":
        $this->status_message = "";
        break;
      case "Serial":
        $this->serial = "";
        break;
    } // switch
  } // tag_open
  // .....................................................................
  /** Method invoked when character data is available. The text of one
   * element can be delivered over several calls, so each piece is
   * appended to what has been collected for the current tag so far.
   */
  function cdata($parser, $cdata) {
    switch ($this->tag) {
      case "Error":
        $this->error_message .= $cdata;
        debugbr("lucene error: $this->error_message");
        break;
      case "Status":
        $this->status_message .= $cdata;
        debugbr("lucene status: $this->status_message");
        break;
      case "Serial":
        $this->serial .= $cdata;
        break;
    } // switch
  } // cdata
  // .....................................................................
  /** Method invoked when a tag is closed. Forgets the current tag and
   * its attributes. */
  function tag_close($parser, $tag) {
    $this->tag = "";
    $this->attr = array();
  } // tag_close
  // .....................................................................
  /** Parse the given XML response. The response is marked invalid if the
   * XML itself did not parse, and any error message which was found is
   * passed to log_sys().
   * @param string $xml The XML content to parse
   */
  function parse($xml) {
    parent::parse($xml);
    if (!$this->valid_xml) {
      $this->valid = false;
    }
    if ($this->error_message != "") {
      log_sys($this->error_message);
    }
  } // parse
} // response_parser class
- // ----------------------------------------------------------------------
- /**
- * Class comprising the functionality of an XML parser for queries. This
- * is for internal use only.
- * @package search
- * @access private
- */
class queryresponse_parser extends response_parser {
  /** Results returned count */
  var $count = 0;
  /** Array of queryresult objects, keyed on the Result 'counter' attribute */
  var $results = array();
  /** True while we are between the opening and closing Results tags */
  var $results_stream = false;
  /** True once the element currently open has delivered some character
      data which we stored, so that further pieces get appended to it
      @access private */
  var $cdata_continues = false;
  // .....................................................................
  /**
   * Constructor. Present so that 'new' still initialises the parser on
   * PHP 8, where a method named after the class is no longer called
   * automatically. It hands over to the legacy constructor below.
   */
  function __construct() {
    $this->queryresponse_parser();
  } // __construct
  // .....................................................................
  /** Construct a new parser (legacy PHP4-style constructor). */
  function queryresponse_parser() {
    $this->response_parser();
  } // queryresponse_parser
  // .....................................................................
  /**
   * Method invoked when a tag is opened. A Results tag starts the results
   * stream, and each Result tag inside it starts a new result which is
   * filed under its 'counter' attribute with its 'rank'.
   * @param resource $parser The XML parser handle
   * @param string $tag Name of the tag being opened
   * @param array $attributes Attributes of the tag, keyed on name
   */
  function tag_open($parser, $tag, $attributes) {
    response_parser::tag_open($parser, $tag, $attributes);
    // Character data for a newly opened element starts afresh..
    $this->cdata_continues = false;
    switch ($tag) {
      case "Results":
        $this->results_stream = true;
        break;
      case "Result":
        $this->addresult(
          isset($this->attr["counter"]) ? $this->attr["counter"] : "",
          isset($this->attr["rank"]) ? $this->attr["rank"] : ""
          );
        // Attributes are merged as tags open, so drop the ones we have
        // just used before the Field tags of this result arrive..
        $this->attr = array();
        break;
    } // switch
  } // tag_open
  // .....................................................................
  /**
   * Method invoked when character data is available. Collects the hit
   * count, and the field values of the most recently added result. The
   * parser may deliver the text of one element in several pieces, so a
   * second or later piece is appended to what is already held, rather
   * than being dropped or replacing it.
   * @param resource $parser The XML parser handle
   * @param string $cdata The piece of character data delivered
   */
  function cdata($parser, $cdata) {
    response_parser::cdata($parser, $cdata);
    switch ($this->tag) {
      case "Count":
        $this->count = ($this->cdata_continues) ? $this->count . $cdata : $cdata;
        $this->cdata_continues = true;
        break;
      case "Field":
        if ($this->results_stream && count($this->attr) > 0 && count($this->results) > 0) {
          $fieldname = isset($this->attr["name"]) ? $this->attr["name"] : "";
          // Work on the last result in place, so that it keeps its key..
          end($this->results);
          $lastkey = key($this->results);
          $result = $this->results[$lastkey];
          if ($this->cdata_continues && isset($result->fields[$fieldname])) {
            $cdata = $result->fields[$fieldname] . $cdata;
          }
          $result->addfield($fieldname, $cdata);
          // Write back: under PHP4 the object above is a copy..
          $this->results[$lastkey] = $result;
          $this->cdata_continues = true;
        }
        break;
    } // switch
  } // cdata
  // .....................................................................
  /**
   * Method invoked when a tag is closed. The closing Results tag ends
   * the results stream.
   * @param resource $parser The XML parser handle
   * @param string $tag Name of the tag being closed
   */
  function tag_close($parser, $tag) {
    response_parser::tag_close($parser, $tag);
    switch ($tag) {
      case "Results":
        $this->results_stream = false;
        break;
    } // switch
  } // tag_close
  // .....................................................................
  /**
   * Add a new, empty result to the response.
   * @param string $id Identifier to file the result under
   * @param string $rank Rank of the result
   */
  function addresult($id, $rank) {
    $this->results[$id] = new queryresult($rank);
  } // addresult
} // queryresponse_parser class
- // ----------------------------------------------------------------------
- /**
- * The lucene query message class. This class inherits all the functionality
- * of the lucene_connection, lucene_msg and lucene_message classes. It adds
- * query-specific methods for searching.
- * @package search
- */
class lucene_querymsg extends lucene_message {
  /** Set to true if sort limit was exceeded in query */
  var $sort_limit_exceeded = false;
  /** Set to true if Lucene blew its memory trying to sort */
  var $sort_memory_exceeded = false;
  // .....................................................................
  /**
   * Constructor. Present so that 'new' still initialises the message on
   * PHP 8, where a method named after the class is no longer called
   * automatically. It hands over to the legacy constructor below.
   * @param string $application Optional application specifier.
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function __construct($application="?", $host="", $port="") {
    $this->lucene_querymsg($application, $host, $port);
  } // __construct
  // .....................................................................
  /**
   * Make a new Lucene query message (legacy PHP4-style constructor).
   * You can specify the application to use here.
   * @param string $application Optional application specifier.
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_querymsg($application="?", $host="", $port="") {
    $this->lucene_message("LuceneQueryRequest", $application, $host, $port);
  } // lucene_querymsg
  // .....................................................................
  /**
   * Set the query for this message. The query is wrapped in a Query tag
   * carrying the default field name, and added to the message.
   * NOTE(review): whether a repeated call replaces an earlier Query tag
   * depends on add_xmltag(), which is defined elsewhere - confirm.
   * @param string $query The query to submit to Lucene.
   */
  function set_query($query) {
    $queryxml = new xmltag("Query", $query);
    $queryxml->setattribute("default-field", DEFAULT_FIELD);
    $this->add_xmltag($queryxml);
  } // set_query
  // .....................................................................
  /**
   * Send the message to Lucene, and then post-process the response for
   * query hits. On a valid response the hit count is stored, and each
   * result is appended to the 'hit' array as an array of its fields plus
   * a "RANK" element. On an invalid response the error message is checked
   * for the sort limit and out of memory conditions, and the matching
   * flag is set.
   * @param integer $timeoutsecs Override for timeout in seconds
   * @return boolean True if the response was valid
   */
  function send($timeoutsecs="") {
    // Initialise flags..
    $this->sort_limit_exceeded = false;
    $this->sort_memory_exceeded = false;
    // Msg-level send-receive transaction..
    lucene_message::send($timeoutsecs);
    // Process the response to our request..
    $this->response = new queryresponse_parser();
    $this->response->parse($this->responsebuf);
    if ($this->response->valid) {
      // Unpack the returned search query hits and store them
      // locally for use by child classes..
      $this->hitcount = (int)($this->response->count);
      if (isset($this->response->results) && is_array($this->response->results)) {
        foreach ($this->response->results as $result) {
          $hit = array("RANK" => $result->rank);
          foreach ($result->fields as $fieldname => $fieldvalue) {
            $hit[$fieldname] = $fieldvalue;
          }
          $this->hit[] = $hit;
        }
      }
    }
    else {
      // A response can be invalid without any error text having been
      // received (eg. bad XML), so do not assume the message is there..
      $errmsg = "";
      if (isset($this->response->error_message)) {
        $errmsg = (string)$this->response->error_message;
      }
      // Check for sort limit/memory error conditions..
      if (stristr($errmsg, "system sort limit") !== false) {
        $this->sort_limit_exceeded = true;
      }
      if (stristr($errmsg, "out of memory") !== false) {
        $this->sort_memory_exceeded = true;
      }
    }
    // Return status of the query, as the other message classes do..
    return $this->response->valid;
  } // send
} // lucene_querymsg class
- // ----------------------------------------------------------------------
- /**
- * The lucene index message class. This class inherits all the functionality
- * of the lucene_connection, lucene_msg and lucene_message classes. It adds
- * indexing-specific methods.
- * @package search
- */
class lucene_indexmsg extends lucene_message {
  // Public
  /** Indication that the indexing was successful */
  var $indexed = false;
  // Private
  /** A unique handle to identify the index
      response from Lucene
      @access private */
  var $serialno = "";
  // .....................................................................
  /**
   * Constructor. Present so that 'new' still initialises the message on
   * PHP 8, where a method named after the class is no longer called
   * automatically. It hands over to the legacy constructor below.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function __construct($application="?", $host="", $port="") {
    $this->lucene_indexmsg($application, $host, $port);
  } // __construct
  // .....................................................................
  /**
   * Make a new Lucene index message (legacy PHP4-style constructor).
   * Generates the serial number for the request, adds it to the message
   * as a Serial tag, and defines the default field as not stored.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_indexmsg($application="?", $host="", $port="") {
    $this->lucene_message("LuceneIndexRequest", $application, $host, $port);
    $this->serialno = md5(uniqid(""));
    $this->add_xmltag( new xmltag("Serial", $this->serialno) );
    $this->define_field(DEFAULT_FIELD, DEFAULT_FIELDTYPE, NOT_STORED);
  } // lucene_indexmsg
  // .....................................................................
  /**
   * Supply field content for indexing. This causes Lucene to take the given
   * fieldname and index the given value against it. NB: we silently ignore
   * the request for nullstring, since these cause Lucene indexing to throw
   * an exception, and indexing will fail. A null value is ignored in the
   * same way, as it would be sent as a nullstring.
   * The field name can have the field type included in the form 'Foo:Date',
   * where 'Date' is the type in this instance. The type is stripped from
   * the name before the field is added.
   * @param string $fieldname Name of the field to index.
   * @param string $fieldvalue Content of the field to index
   */
  function index_field($fieldname, $fieldvalue) {
    if ($fieldvalue !== "" && $fieldvalue !== null) {
      $fieldname = $this->strip_field_type($fieldname);
      $this->add_field($fieldname, $fieldvalue);
    }
  } // index_field
  // .....................................................................
  /**
   * Index the given content against the given ID. The ID is added to the
   * message as an Id tag. Newlines, carriage returns and tabs in the
   * content are turned into spaces, runs of spaces are collapsed to one,
   * and the result is added as the value of the default field. Nothing
   * is added at all if the content is a nullstring or null.
   * @param string $id The ID to associate with the given indexed data.
   * @param string $content The text content to be indexed.
   */
  function index_content($id, $content) {
    if ($content !== "" && $content !== null) {
      $this->add_xmltag( new xmltag("Id", $id) );
      $content = preg_replace("/[\n\r\t]/", " ", $content);
      $content = preg_replace("/[ ]{2,}/", " ", $content);
      $this->add_field(DEFAULT_FIELD, $content);
    }
  } // index_content
  // .....................................................................
  /**
   * Send the message to Lucene, and then post-process the response for
   * indication of a successful index operation. We expect to receive
   * a response back from Lucene which has our serialno in it. This method
   * returns True if the indexing was successful, else False.
   * @param integer $timeoutsecs Override for timeout in seconds
   * @return boolean True if indexing was successful.
   */
  function send($timeoutsecs="") {
    // Start from failure, so that a success recorded by an earlier
    // send of this same message cannot be reported again..
    $this->indexed = false;
    // Msg-level send-receive transaction..
    lucene_message::send($timeoutsecs);
    // Process the response to our request..
    $this->response = new response_parser();
    $this->response->parse($this->responsebuf);
    // Unpack the response if no errors..
    if ($this->response->valid) {
      // Compare as strings: a loose comparison would treat two different
      // numeric-looking hashes (eg. "0e12.." and "0e34..") as equal..
      $returned = trim((string)$this->response->serial);
      $this->indexed = ($returned === (string)$this->serialno);
    }
    // Return status of indexing operation..
    return $this->indexed;
  } // send
} // lucene_indexmsg class
- // ----------------------------------------------------------------------
- /**
- * The lucene unindex message class. This class allows you to remove an
- * item from the Lucene index. You must know the unique ID that identifies
- * the document.
- * @package search
- */
class lucene_unindexmsg extends lucene_message {
  // .....................................................................
  /**
   * Constructor. Present so that 'new' still initialises the message on
   * PHP 8, where a method named after the class is no longer called
   * automatically. It hands over to the legacy constructor below.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function __construct($application="?", $host="", $port="") {
    $this->lucene_unindexmsg($application, $host, $port);
  } // __construct
  // .....................................................................
  /**
   * Make a new Lucene unindex message (legacy PHP4-style constructor).
   * This message is provided to allow you to delete an item from the
   * Lucene index. It has a single method 'unindex' which takes the ID
   * of the item to delete.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_unindexmsg($application="?", $host="", $port="") {
    $this->lucene_message("LuceneUnIndexRequest", $application, $host, $port);
  } // lucene_unindexmsg
  // .....................................................................
  /**
   * Nominate the document to unindex, as identified by its unique ID.
   * This adds the ID to the message as an Id tag; the message still has
   * to be sent for anything to happen.
   * @param string $id The ID to allow Lucene to identify the item to unindex
   */
  function unindex($id) {
    $this->add_xmltag( new xmltag("Id", $id) );
  } // unindex
} // lucene_unindexmsg class
- // ----------------------------------------------------------------------
- /**
- * The lucene purge message class. This class allows you to remove all
- * items from the Lucene index. Take care!
- * @package search
- */
class lucene_purgemsg extends lucene_message {
  // .....................................................................
  /**
   * Constructor. Present so that 'new' still initialises the message on
   * PHP 8, where a method named after the class is no longer called
   * automatically. It hands over to the legacy constructor below.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function __construct($application="?", $host="", $port="") {
    $this->lucene_purgemsg($application, $host, $port);
  } // __construct
  // .....................................................................
  /**
   * Make a new Lucene purge message (legacy PHP4-style constructor).
   * This message is provided to allow you to delete all items from the
   * Lucene index. It is an unindex request carrying a Purge tag.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_purgemsg($application="?", $host="", $port="") {
    $this->lucene_message("LuceneUnIndexRequest", $application, $host, $port);
    $this->add_xmltag( new xmltag("Purge") );
  } // lucene_purgemsg
} // lucene_purgemsg class
- // ----------------------------------------------------------------------
- /**
- * The lucene utility message class. Used for special Lucene operations.
- * @package search
- */
class lucene_utilitymsg extends lucene_message {
  // .....................................................................
  /**
   * Constructor. Present so that 'new' still initialises the message on
   * PHP 8, where a method named after the class is no longer called
   * automatically. It hands over to the legacy constructor below.
   * @param string $utilitycmd Command for this utility message.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function __construct($utilitycmd="", $application="?", $host="", $port="") {
    $this->lucene_utilitymsg($utilitycmd, $application, $host, $port);
  } // __construct
  // .....................................................................
  /**
   * Make a new Lucene utility message (legacy PHP4-style constructor).
   * If a command is given it is added to the message as a Utility tag.
   * @param string $utilitycmd Command for this utility message.
   * @param string $application Optional application specifier
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_utilitymsg($utilitycmd="", $application="?", $host="", $port="") {
    $this->lucene_message("LuceneUtilityRequest", $application, $host, $port);
    if ($utilitycmd != "") {
      $this->add_xmltag( new xmltag("Utility", $utilitycmd) );
    }
  } // lucene_utilitymsg
  // .....................................................................
  /**
   * Send the message to Lucene, and then post-process the response for
   * indication of a successful utility operation. We expect to receive
   * a response back from Lucene which has nothing much in it, unless
   * there has been an error.
   * @param integer $timeoutsecs Override for timeout in seconds
   * @return boolean True if operation was successful.
   */
  function send($timeoutsecs="") {
    // Msg-level send-receive transaction..
    lucene_message::send($timeoutsecs);
    // Process the response to our request..
    $this->response = new response_parser();
    $this->response->parse($this->responsebuf);
    // Return status of utility operation..
    return $this->response->valid;
  } // send
} // lucene_utilitymsg class
- // ----------------------------------------------------------------------
- /**
- * The lucene search class
- * This class inherits the functionality of the generic 'search' class. It
- * extends it to implement a LUCENE search. Use the methods in this class
- * as the mainstay in implementing queries of content from Lucene. Most
- * methods, such as match(), matchfield(), matchrange() etc. store the
- * requirement in the class for subsequent building using the set_*()
- * methods of the lucene classes to set the relevant fields. This is only
- * done when you call execute(), and the query is built from all the
- * composite terms you have added via match() et al.
- * @package search
- */
class lucene_search extends lucene_querymsg {
  // .....................................................................
  /**
   * Constructor. Present so that 'new' still initialises the search on
   * PHP 8, where a method named after the class is no longer called
   * automatically. It hands over to the legacy constructor below.
   * @param string $application Application name/domain name for searching in
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function __construct($application="?", $host="", $port="") {
    $this->lucene_search($application, $host, $port);
  } // __construct
  // .....................................................................
  /**
   * Create a new lucene search (legacy PHP4-style constructor). Runs the
   * 'search' and 'lucene_querymsg' constructors, then initialise().
   * @param string $application Application name/domain name for searching in
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   */
  function lucene_search($application="?", $host="", $port="") {
    $this->search();
    $this->lucene_querymsg($application, $host, $port);
    $this->initialise();
  } // lucene_search
  // .....................................................................
  /**
   * Add a new search term to match. Search terms can be a single word or
   * compound patterns. Each time one of these is added, it has an operator
   * associated with it - whether this term is a "may have" (OR), or a
   * "must have" (AND) term.
   * NB: This method overrides the parent method in order to ensure that all
   * boolean logic terms are in upper case as Lucene requires. The words
   * 'and', 'or' and 'not' standing between spaces are upper-cased whatever
   * mix of case they were written in.
   * @param string $term Search term text to match.
   * @param string $op Joining operator: 'AND', 'OR', 'NOT', 'AND NOT'.
   * @param string $id An optional ID to associate with this search term.
   * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
   */
  function match($term, $op="OR", $id="", $boost="") {
    $anycase_ops = array("/ and /i", "/ or /i", "/ not /i");
    $UCops = array(" AND ", " OR ", " NOT ");
    $term = preg_replace($anycase_ops, $UCops, $term);
    if ($boost != "") $term .= "^$boost";
    search::match($term, strtoupper($op), $id);
  } // match
  // .....................................................................
  /**
   * Add search term to match a field value.
   * This is used to add a search term which defines the value that a given
   * field may or may not contain for the search to succeed. When an array
   * of values is given, the values are OR'ed together into a single term.
   * Values which are blank after trimming are left out, and if nothing
   * remains then no term is added.
   * For adding terms which are 'free' (as a user might type into a search
   * box for example) then you can use the match() method.
   * @param string $fieldname Name of field to reference in the index
   * @param mixed $fieldvalue Value or array of values, for field to match
   * @param string $op Operator to join this term to others in the query
   * @param string $id Optional identity tag for this term
   * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
   */
  function matchfield($fieldname, $fieldvalue, $op="OR", $id="", $boost="") {
    // Nothing to do for a null value. This is checked before tracing
    // starts, so that the two debug_trace() calls below stay paired..
    if (!isset($fieldvalue)) return;
    debug_trace($this);
    if (!is_array($fieldvalue)) {
      $fieldvalue = array($fieldvalue);
    }
    $term = "";
    foreach ($fieldvalue as $value) {
      $value = trim($value);
      if ($value != "") {
        $term .= " OR " . $this->fieldterm($fieldname, $value);
      }
    }
    if ($term != "") {
      $term = substr($term, 4); // Get rid of initial OR
      // Register the search term..
      $this->match($term, strtoupper($op), $id, $boost);
    }
    debug_trace();
  } // matchfield
  // .....................................................................
  /**
   * Helper function to build field search term. The value is prefixed
   * with the field name and a colon, unless the field is the default
   * field, in which case the value is returned as it is.
   * @param string $fieldname Name of field to reference in the index
   * @param string $fieldvalue Value of field to match
   * @return string The search term
   * @access private
   */
  function fieldterm($fieldname, $fieldvalue) {
    if ($fieldname != DEFAULT_FIELD) {
      $term = "$fieldname:$fieldvalue";
    }
    else {
      $term = $fieldvalue;
    }
    return $term;
  } // fieldterm
  // .....................................................................
  /**
   * Add search term to match a field value range.
   * This is used to add a search term which defines the range of values that
   * a given field may or may not contain for the search to succeed. The
   * range is handed directly to set_range().
   * NB: This method is always a must match (implied AND) search term. In
   * other words the search is always restricted/refined by it.
   * @param string $fromvalue Lower range value of field to match
   * @param string $tovalue Upper range value of field to match
   * @param string $fieldname Name of field. Must be supplied.
   */
  function matchrange($fromvalue, $tovalue, $fieldname) {
    debug_trace($this);
    $this->set_range($fromvalue, $tovalue, $fieldname);
    debug_trace();
  } // matchrange
  // .....................................................................
  /**
   * Add search term: must match a field value.
   * This is used to add a search term which defines the value that a given
   * field must contain for the search to succeed.
   * @param string $fieldname Name of field
   * @param string $fieldvalue Value of field to match
   * @param string $id Optional identity tag for this term
   * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
   */
  function must_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
    $this->matchfield($fieldname, $fieldvalue, "AND", $id, $boost);
  } // must_matchfield
  // .....................................................................
  /**
   * Add search term: may match a field value.
   * This is used to add a search term which defines the value that a given
   * field may contain for the search to succeed.
   * @param string $fieldname Name of field
   * @param string $fieldvalue Value of field to match
   * @param string $id Optional identity tag for this term
   * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
   */
  function may_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
    $this->matchfield($fieldname, $fieldvalue, "OR", $id, $boost);
  } // may_matchfield
  // .....................................................................
  /**
   * Add search term: must not match a field value.
   * This is used to add a search term which defines the value that a given
   * field must not contain for the search to succeed.
   * @param string $fieldname Name of field
   * @param string $fieldvalue Value of field to match
   * @param string $id Optional identity tag for this term
   * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
   */
  function does_not_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
    $this->matchfield($fieldname, $fieldvalue, "NOT", $id, $boost);
  } // does_not_matchfield
  // .....................................................................
  /**
   * Execute the search
   * If the query is valid, we set the query string, the result limit
   * (plus the offset, when a limit applies) and any date range on the
   * message, send it to the Lucene server, and flag the search as
   * executed. An invalid query is reported to the debugger and nothing
   * is sent.
   * @param integer $timeoutsecs Override for timeout in seconds
   */
  function execute($timeoutsecs="") {
    debug_trace($this);
    // The queryvalid() method is in the parent class 'search', and
    // calls the build() method in the same class. The build() method is
    // a raw routine to join together the search terms with ANDs and
    // ORs. You may have to override it for Lucene. If so, just create
    // a new build() method in this class.
    if ($this->queryvalid()) {
      // Define the query string..
      $this->set_query($this->query);
      // Set limit, offset..
      if ($this->max_results > 0) {
        $this->set_limit($this->max_results);
        if ($this->skip_results > 0) {
          $this->set_first($this->skip_results);
        }
      }
      // Set any daterange..
      if ($this->has_daterange()) {
        $this->set_range($this->date_start, $this->date_end, $this->date_fieldname);
      }
      // Send to Lucene..
      $this->send($timeoutsecs);
      // Flag that we did it..
      $this->executed = true;
      debugbr("lucene search: exec ok: returning " . $this->hitcount() . " hits");
    }
    else {
      debugbr("lucene search: invalid query: '$this->query'");
    }
    debug_trace();
  } // execute
} // lucene_search class
- // ----------------------------------------------------------------------
- /**
- * The lucene file indexer class.
- * This class indexes files on disc, either one by one or as a whole
- * file hierarchy tree.
- * @package search
- */
- class lucene_fileindexer {
- // Public
- /** Application we are indexing for */
- var $application = "";
- /** Host to connect to */
- var $host = "";
- /** Port to connect to */
- var $port = "";
- // Private
- /** The index ID
- @access private */
- var $ixid;
- /** ID generation source
- @access private */
- var $idsource = ID_FROM_INC;
- /** Scan for meta tags as fields in file content. Recommended.
- @access private */
- var $metascan = true;
- /** Meta fields definitions array. Contains definitions
- for the fields we will process if found as meta tags.
- @access private */
- var $meta_fields = array();
- /** Index fields definitions array. Contains definitions
- for the fields we are expecting to index.
- @access private */
- var $field_definitions = array();
- /** Fields for indexing. This is an array of fieldname/value
- pairs which should be added during the indexing. These
- fields do not have to appear in $field_definitions.
- @access private */
- var $indexfields = array();
- /** ID generation offset
- @access private */
- var $idoffset = 0;
- /** ID generation prefix
- @access private */
- var $idprefix = "";
- /** The index object which does the work
- @access private */
- var $lucene_indexer;
- /** Timeout for indexing commands in seconds (can usually leave
- as nullstring)
- @access private */
- var $timeoutsecs = "";
- /** Path to a lockfile we should give way to. If this value
- is not nullstring, then no indexing will be done while the
- file exists. If lockfile_wait is > 0, then we only wait
- this many seconds.
- @access private */
- var $lockfile = "";
- /** Number of seconds to wait on a lockfile. If zero, wait forever.
- @access private */
- var $lockfile_wait_secs = 0;
- /** Indexing execution timer
- @access private */
- var $timer;
- // .....................................................................
- /**
- * Constructor
- * Create a new lucene indexer
- * @param string $application Application name
- * @param string $host Hostname or IP of Lucene server
- * @param string $port Port of Lucene server
- */
function __construct($application="?", $host="", $port="") {
  // PHP 8 no longer calls a method named after its class on 'new', so
  // this hands over to the legacy constructor which does the set-up..
  $this->lucene_fileindexer($application, $host, $port);
} // __construct
// .....................................................................
/**
 * Create a new lucene indexer (legacy PHP4-style constructor).
 * @param string $application Application name
 * @param string $host Hostname or IP of Lucene server
 * @param string $port Port of Lucene server
 */
function lucene_fileindexer($application="?", $host="", $port="") {
  // Store for reference..
  $this->application = $application;
  $this->host = $host;
  $this->port = $port;
  // Timer object for the indexing execution..
  $this->timer = new microtimer();
} // lucene_fileindexer
- // .....................................................................
- /**
- * Define a field. We supply the name of the field, its type (Text, Date
- * or Id), and whether it should be stored by Lucene for later retrieval in
- * queries. For example you would not store the raw document/content as this
- * is usually stored elsewhere.
- * IMPORTANT NOTE: Fields defined here will automatically be included as
- * meta fields.
- * @see meta_fields()
- * @param string $fieldname Name of the field to index
- * @param string $type Type of field data: Text, Date or Id.
- * @param boolean $stored If true then Lucene will store the content itself
- * @param boolean $indexed If true then Lucene will index the field content
- */
function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
  // The definition is held as one string: "type|stored|indexed", with
  // the two flags spelled out as "true" or "false"..
  $storedflag = "false";
  if ($stored) {
    $storedflag = "true";
  }
  $indexedflag = "false";
  if ($indexed) {
    $indexedflag = "true";
  }
  $this->field_definitions[$fieldname] = $type . "|" . $storedflag . "|" . $indexedflag;
  // A defined field is always registered for meta tags as well..
  $this->meta_field($fieldname, $type);
} // define_field
- // .....................................................................
- /**
- * Define a lockfile which we must avoid during indexing. If defined
- * then no indexing will take place while the lockfile exists. The
- * second parameter allows you to specify a limit to the patience of
- * this process, in seconds. Zero means wait forever.
- * @param string $lockfile Path to the lockfile. Nullstring = not defined
- * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
- */
function avoid_lockfile($lockfile, $wait_secs=0) {
  // How long we are prepared to wait, and the file we wait on..
  $this->lockfile_wait_secs = $wait_secs;
  $this->lockfile = $lockfile;
} // avoid_lockfile
- // .....................................................................
- /**
- * Define a field as a meta tag. This ensures that the field will be
- * picked up from the file meta tags, if present. If it is not listed
- * here then it will be ignored.
- * IMPORTANT NOTE: We define the strict rule that ONLY fields which have
- * been defined here can be added to the indexing via the meta tag scanning.
- * Ie. you must define fields here explicitly, or via the define_field()
- * method, or they will be ignored even if they turn up as a meta tag.
- * This is so we can restrict the indexing, and be sure of field types.
- * @see define_field()
- * @param string $fieldname Name of the field to process as meta tag
- * @param string $type Type of field data: Text, Date or Id.
- */
function meta_field($fieldname, $type)
{
  // Record the type against the field name. Registering the same
  // field name again replaces the type recorded earlier..
  $this->meta_fields[$fieldname] = $type;
} // meta_field
- // .....................................................................
- /**
- * Supply field content for indexing. This causes Lucene to take the given
- * fieldname and index the given value against it.
- * The field name can have the field type included in the form 'Foo:Date',
- * where 'Date' is the type in this instance. In fact, since 'Text' is the
- * default field type, 'Date' is probably the only one you need to use
- * as the current implementation stands.
- * @param string $fieldname Name of the field to index.
- * @param string $fieldvalue Content of the field to index
- */
function index_field($fieldname, $fieldvalue)
{
  // Held in the indexfields array, keyed on field name; a later
  // value for the same field name replaces the earlier one..
  $this->indexfields[$fieldname] = $fieldvalue;
} // index_field
- // .....................................................................
- /**
- * Set the source for ID generation. Since we are indexing a bunch of
- * files, the ID's have to be generated on demand inside the loop. So
- * we provide for various ways here, and you can extend this class to
- * provide more if required.
- * Main ways:
- * ID_FROM_INC Increment a counter by 1 each time (with offset)
- * ID_FROM_NAME Take the filename, strip the extension, add prefix
- * ID_FROM_FILENAME Take the full filename, add prefix
- * ID_FROM_PATH Take the full file path
- * NB: These are all defined as integer constants.
- * @param integer $idsource Source of ID generation
- * @param mixed $pfxofs String prefix, or integer offset
- */
function id_generate($idsource=ID_FROM_INC, $pfxofs="") {
  $this->idsource = $idsource;
  // No prefix or offset supplied. These are tested explicitly, since a
  // loose comparison with nullstring gives a different answer for an
  // integer offset of 0 depending on the PHP version in use..
  if ($pfxofs === "" || $pfxofs === null || $pfxofs === false) {
    return;
  }
  if (is_string($pfxofs)) {
    // A string is always taken to be the ID prefix..
    $this->idprefix = $pfxofs;
  }
  else {
    // Anything else is taken to be the numeric ID offset..
    $this->idoffset = (int)$pfxofs;
  }
} // id_generate
- // .....................................................................
- /**
- * Flag that we should do a tag scan on the content of the files to try
- * and extract fields to index. Note that any tags thus found will only
- * be used if the field name has been defined with the method define_field();
- * This causes both the <title> tag and <meta> tags to be considered.
- * @see lucene_fileindexer::define_field()
- */
function scantags()
{
  // Switch the meta tag scan on..
  $this->metascan = true;
} // scantags
- // .....................................................................
- /**
- * Flag that we should NOT do a tag scan on the content of the files.
- */
function noscantags()
{
  // Switch the meta tag scan off..
  $this->metascan = false;
} // noscantags
- // .....................................................................
- /**
- * Index a file located at the given path, using given ID.
- * You can also use the parameter $fields to supply an array of
- * fieldname/value pairs to index with this file, for one-off indexing of
- * files. If the fieldname is a date field, make sure to define the
- * name as 'Foo:Date', to cause the field definition to be correct.
- * @param string $path Path to the file to index
- * @param string $id ID to associate with the indexed file content
- * @param mixed $fields Array of field/values to index with file
- */
- function index_file($path, $id, $fields=false) {
- $success = false;
- $f = new inputfile($path);
- if ($f->opened) {
- $f->readall();
- $f->closefile();
- // Wait for a lockfile, if we really have to..
- if ($this->lockfile != "" && file_exists($this->lockfile)) {
- $waitforit = true;
- debugbr("waiting for lockfile..", DBG_DEBUG);
- if ($this->lockfile_wait_secs > 0) {
- $locktimer = new microtimer();
- $locktimer->start();
- }
- do {
- clearstatcache();
- if (!file_exists($this->lockfile)) {
- $waitforit = false;
- debugbr("lockfile has been removed..", DBG_DEBUG);
- }
- elseif ($this->lockfile_wait_secs > 0 && $locktimer->secs() >= $this->lockfile_wait_secs) {
- $waitforit = false;
- debugbr("lockfile wait (" . $this->lockfile_wait_secs ."secs) timed out..", DBG_DEBUG);
- }
- else {
- sleep(1);
- }
- } while ($waitforit === true);
- }
- // Create the index message..
- $ix = new lucene_indexmsg($this->application, $this->host, $this->port);
- // Define the fields for the index message..
- foreach ($this->field_definitions as $fieldname => $attributes) {
- $bits = explode("|", $attributes);
- $type = $bits[0];
- $stored = (strcasecmp($bits[1], "true") == 0);
- $indexed = (strcasecmp($bits[2], "true") == 0);
- $ix->define_field($fieldname, $type, $stored, $indexed);
- }
- // Scan file content for meta tags for index fields..
- $content = preg_replace("/[\xe2][\x80]./", "", $f->content);
- $content = preg_replace("/[\xc2][\xb7]./", "", $content);
- $content = preg_replace("/[\xc2]&/", " ", $content);
- $content = preg_replace("/[\xc3]&/", " ", $content);
- if ($this->metascan) {
- $tagpat = "/<meta name=\"(.*?)\" content=\"(.*?)\">/i";
- $matches = array();
- if (preg_match_all($tagpat, $content, $matches)) {
- for ($i=0; $i < count($matches[0]); $i++) {
- $fieldname = $matches[1][$i];
- $fieldvalue = $matches[2][$i];
- if (isset($this->meta_fields[$fieldname])) {
- // Get type..
- $type = $this->meta_fields[$fieldname];
- if (!strcasecmp($type, "date")) {
- // Newsquest date field format requires stripping off a prefix
- // 'DT' - a temporary hack which should be completely transparent
- // to everyone else using this. NB: originally NewsQuest only
- // stored date in 'DTdd/mm/yyyy' format. This parsing is also
- // compatible with the new 'DTdd/mm/yyyy hh:mm[:ss]' format.
- if (substr($fieldvalue, 0, 2) == "DT") {
- $fieldvalue = substr($fieldvalue, 2);
- }
- // Need to convert to Unix timestamp..
- $ts = displaydate_to_timestamp($fieldvalue);
- $fieldvalue = $ts;
- }
- debugbr("meta tag index field: $fieldname=$fieldvalue");
- $ix->index_field($fieldname, $fieldvalue);
- }
- else {
- debugbr("rejected unlisted tag field: $fieldname");
- }
- }
- }
- // Check for title tag in HTML page if required field..
- if (preg_match("/<(title)>(.*?)<\/title>/i", $content, $matches)) {
- $fieldname = $matches[1];
- $fieldvalue = $matches[2];
- if (isset($this->meta_fields[$fieldname])) {
- $type = $this->meta_fields[$fieldname];
- debugbr("title tag index field: $fieldname=$fieldvalue");
- $ix->index_field($fieldname, $fieldvalue);
- }
- }
- } // metascan
- // Deal with passed-in field settings. These are meant to cater
- // for indexing of individual files using this method. We just
- // add them to any existing field/values already set up..
- if ($fields) {
- reset($fields);
- while (list($fieldname, $fieldvalue) = each($fields)) {
- $this->index_field($fieldname, $fieldvalue);
- }
- }
- // Process field/value pairs which have been added either by the
- // index_field() method, or passed in via the $fields parameter..
- if (count($this->indexfields) > 0) {
- reset($this->indexfields);
- while (list($fieldname, $fieldvalue) = each($this->indexfields)) {
- $bits = explode(":", $fieldname);
- $type = ((isset($bits[1])) ? $bits[1] : "Text");
- $fieldname = $bits[0];
- debugbr("index field: $fieldname=$fieldvalue");
- $ix->define_field($fieldname, $type);
- $ix->index_field($fieldname, $fieldvalue);
- }
- }
- // Index the file content. We get rid of any HTML tags..
- debugbr("indexing file: $path, ID=$id");
- $ix->index_content($id, strip_tags($content));
- // Send the index message to lucene. We specify a large
- // timeout since we really want this to succeed and Lucene
- // may be in an optimization fugue..
- $success = $ix->send(120);
- if(!$success) {
- debugbr("failed: $ix->error_msg");
- }
- }
- else {
- debugbr("open failed on '$path'");
- }
- return $success;
- } // index_file
- // .....................................................................
/**
 * Index a tree of files starting at the path given. We index these in one
 * of four modes, which determines how we generate the ID for each item:
 * 'ID_FROM_INC' mode uses an incremental counter. The counter is offset
 * by the 'idoffset' setting, and each item has an ID incremented by one
 * from the last one.
 * 'ID_FROM_NAME' mode uses the filename, stripped of any path and of its
 * last extension, as the ID, prefixed with the 'idprefix' setting.
 * 'ID_FROM_FILENAME' mode uses the filename, including any extension,
 * as the ID, prefixed with the 'idprefix' setting.
 * 'ID_FROM_PATH' mode uses the full path to the item being indexed as the
 * ID, prefixed with the 'idprefix' setting.
 * The file will simply be indexed as a single Text field, with the
 * appropriate ID, and no other index fields unless $metascan is set to TRUE.
 * If this is the case, the system will scan the file for HTML meta tags of
 * form: '<meta name="foo" content="bar">'. In this example a field of name
 * 'foo' would be given value 'bar'.
 * The list of items is produced with the 'find' command and written to a
 * temporary file in /tmp, unless we are restarting from an existing list.
 * @param string $path Path to the head of the file tree to index, or, in
 *                     restart mode, path to a file which lists the paths
 * @param string $patt Pattern to match, eg. '*.html'
 * @param string $restart If equal to "restart" then treat $path as file of paths
 * @param string $lockfile Path to the lockfile. Nullstring = not defined.
 *                         If defined, we idle whilst this file exists
 * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
 */
function index_tree($path, $patt="", $restart="", $lockfile="", $wait_secs=0) {
  // Set up any lockfile definition. NB: pass the $wait_secs parameter
  // through, so that the caller's timeout actually takes effect..
  $this->avoid_lockfile($lockfile, $wait_secs);

  if ($restart == "restart") {
    // Restart from existing paths file..
    $tmpfname = $path;
    debugbr("restarting with existing item list $path", DBG_DEBUG);
  }
  else {
    // Use find to generate item list to a temporary file. Every shell
    // argument is escaped, so that paths and patterns which contain
    // spaces or shell metacharacters are passed to find literally..
    // NOTE(review): the temporary list file is left in place afterwards,
    // presumably so it can be used for a later restart - confirm.
    debugbr("generating item list", DBG_DEBUG);
    $tmpfname = tempnam("/tmp", "LU");
    $cmd = "find " . escapeshellarg($path);
    if ($patt != "") {
      $cmd .= " -name " . escapeshellarg($patt);
    }
    $cmd .= " >" . escapeshellarg($tmpfname);
    exec($cmd);
  }

  $treelist = new inputfile($tmpfname);
  if ($treelist->opened) {
    // Find the number of items..
    debugbr("counting items", DBG_DEBUG);
    $todo = (int) exec("cat " . escapeshellarg($tmpfname) . "|wc -l");
    if ($todo > 0) {
      $done = 0; $succeeded = 0; $failed = 0; $last = 0;
      debugbr("$todo items to index", DBG_DEBUG);
      $this->timer->start();

      // Counter for incremental IDs, starting from any defined offset..
      $idix = 0;
      if ($this->idsource == ID_FROM_INC) {
        $idix += $this->idoffset;
      }

      while ($path = $treelist->readln()) {
        // Generate an ID to use..
        switch ($this->idsource) {
          case ID_FROM_INC:
            // Use incremented index..
            $idix += 1;
            $id = $idix;
            break;
          case ID_FROM_NAME:
            // Use filename, minus the last extension..
            $fname = basename($path);
            if (strstr($fname, ".")) {
              $bits = explode(".", $fname);
              array_pop($bits);
              $fname = implode(".", $bits);
            }
            $id = $this->idprefix . $fname;
            break;
          case ID_FROM_FILENAME:
            // Use full filename..
            $id = $this->idprefix . basename($path);
            break;
          case ID_FROM_PATH:
            // Use full file path..
            $id = $this->idprefix . $path;
            break;
        } // switch

        // Index the file with new ID. A failure is counted, but does
        // not stop us from processing the rest of the list..
        if ($this->index_file($path, $id)) {
          debugbr("$id indexed", DBG_DEBUG);
          $succeeded += 1;
        }
        else {
          debugbr("$path index failed", DBG_DEBUG);
          $failed += 1;
        }

        // Progress check..
        $done += 1;

        // If the verbose output option is enabled, we compile
        // stats and display these via the debugger. A progress mark
        // is emitted each time we reach a new multiple of 5%..
        if (debugging()) {
          $pct = ($done / $todo) * 100;
          $pct_int = (int)(floor($pct));
          // Modulus is done on the integer value, which avoids an
          // implicit (and lossy) float to integer conversion..
          $pct_mod = $pct_int % 5;
          if ($pct_mod == 0 && $pct_int > $last) {
            $secperdoc = $this->timer->secs() / $done;
            $timedone = $this->timer->formatted_time();
            $timeleft = nicetime(($todo - $done) * $secperdoc);
            $ms = $this->timer->millisecs();
            $msper = number_format( ($ms / $done), 0);
            debugbr("Mark: $pct_int% $timedone ($done) Rate:$msper" . "ms/item Left:$timeleft", DBG_DEBUG);
            $last = $pct_int;
          }
        }
      } // while

      // Close tree list file..
      $treelist->closefile();

      // Wrap it up..
      $this->timer->stop();

      // Final stats if verbose mode..
      if (debugging()) {
        $secs = $this->timer->secs();
        $msper = number_format( (1000 * $secs / $todo), 2);
        // Keep this one numeric: a number_format() string may contain
        // thousands separators, which would corrupt the value that
        // is handed on to nicetime()..
        $sper1000 = round(($secs / $todo) * 1000, 2);
        debugbr("time taken per item: " . $msper . "msec", DBG_DEBUG);
        debugbr("time per 1000 items: " . nicetime($sper1000), DBG_DEBUG);
        debugbr("total time taken: " . $this->timer->formatted_time(), DBG_DEBUG);
        debugbr("successfully indexed: $succeeded", DBG_DEBUG);
        debugbr("indexing failures: $failed", DBG_DEBUG);
      }
    }
    else {
      debugbr("nothing to index", DBG_DEBUG);
    }
  }
  else {
    debugbr("failed to open $tmpfname", DBG_DEBUG);
  }
} // index_tree
- } // lucene_fileindexer class
- // ----------------------------------------------------------------------
/**
 * Ask the Lucene server to optimize its index. This would commonly
 * be done once a batch of items has been indexed. An "OPTIMIZE"
 * utility message is sent using the SOCK_FOREVER timeout setting,
 * and the validity flag of the response is handed back.
 * @param string $application Application name/domain name for searching in
 * @param string $host Hostname or IP of Lucene server
 * @param string $port Port of Lucene server
 * @return boolean True if the operation was successful.
 */
function lucene_optimize($application="?", $host="", $port="") {
  $msg = new lucene_utilitymsg("OPTIMIZE", $application, $host, $port);
  // The return value of send() is not used: the outcome is taken
  // from the response which is held on the message object..
  $msg->send(SOCK_FOREVER);
  $response = $msg->response;
  return $response->valid;
} // lucene_optimize
- // ----------------------------------------------------------------------
/**
 * Ask the Lucene server to make a backup of its index. This would
 * commonly be done after a batch of items have been successfully
 * optimized (which indicates a sound index). A "BACKUP" utility message
 * is sent using the SOCK_FOREVER timeout setting, and the validity flag
 * of the response is handed back.
 * The backup will be made to the directory specified in the application
 * .properties file as the property 'Lucene-Backup-Directory=' or, if not
 * there, then in the Lucene properties file 'Server.properties' as the
 * same property. If neither of these is defined, the server will attempt
 * to use a sub-directory called {Lucene-Index-Directory}_backup, where
 * {Lucene-Index-Directory} is the index path as already defined in the
 * 'Server.properties' file.
 * @param string $application Application name/domain name for searching in
 * @param string $host Hostname or IP of Lucene server
 * @param string $port Port of Lucene server
 * @return boolean True if the operation was successful.
 */
function lucene_backup($application="?", $host="", $port="") {
  $msg = new lucene_utilitymsg("BACKUP", $application, $host, $port);
  // The return value of send() is not used: the outcome is taken
  // from the response which is held on the message object..
  $msg->send(SOCK_FOREVER);
  $response = $msg->response;
  return $response->valid;
} // lucene_backup
- // ----------------------------------------------------------------------
/**
 * Ask the Lucene server to purge its index of all indexes to documents.
 * WARNING: this DELETES ALL DOCUMENTS FROM THE INDEX, permanently, so be
 * very sure that this is what you mean to do before calling it.
 * A purge message is sent using the SOCK_FOREVER timeout setting, and
 * the validity flag of the response is handed back.
 * @param string $application Application name/domain name for searching in
 * @param string $host Hostname or IP of Lucene server
 * @param string $port Port of Lucene server
 * @return boolean True if the purging operation was successful.
 */
function lucene_purge($application="?", $host="", $port="") {
  $msg = new lucene_purgemsg($application, $host, $port);
  // The return value of send() is not used: the outcome is taken
  // from the response which is held on the message object..
  $msg->send(SOCK_FOREVER);
  $response = $msg->response;
  return $response->valid;
} // lucene_purge
- // ----------------------------------------------------------------------
- ?>
Documentation generated by phpDocumentor 1.3.0RC3