openpkg-packages/htdig/htdig.patch

Index: htdig/ExternalParser.cc
--- htdig/ExternalParser.cc.orig	2004-05-28 15:15:14 +0200
+++ htdig/ExternalParser.cc	2006-06-17 22:25:21 +0200
@@ -227,7 +227,7 @@
     int		get_file = (convertToType.length() != 0);
     String	newcontent;

-    StringList	cpargs(currentParser);
+    StringList	cpargs(currentParser, " \t");
     char   **parsargs = new char * [cpargs.Count() + 5];
     int    argi;
     for (argi = 0; argi < cpargs.Count(); argi++)
@@ -424,7 +424,7 @@
 		  {
 			metadatetags = new StringMatch();
 			metadatetags->IgnoreCase();
-			metadatetags->Pattern("date|dc.date|dc.date.created|dc.data.modified");
+			metadatetags->Pattern("date|dc.date|dc.date.created|dc.date.modified");
 		  }

 		  // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5>
Index: htnet/HtHTTP.cc
--- htnet/HtHTTP.cc.orig	2004-05-28 15:15:23 +0200
+++ htnet/HtHTTP.cc	2006-06-17 22:25:21 +0200
@@ -643,6 +643,8 @@
     String	line = 0;
     int		inHeader = 1;

+    _needUTF8Convert = 0;
+
     if (_response._modification_time)
     {
 	delete _response._modification_time;
@@ -731,7 +733,15 @@
             token = strtok(token, "\n\t");

             if (token && *token)
+            {
                _response._content_type = token;
+               if ((_response._content_type.indexOf("text/html") != -1) && (_response._content_type.indexOf("UTF-8") != -1))
+               {
+                  if ( debug > 4 )
+                     cout << "needUTF8Convert flagged" << endl;
+                  _needUTF8Convert = 1;
+               }
+            }

          }
          else if( ! mystrncasecmp((char*)line, "content-length:", 15))
@@ -970,6 +980,57 @@

     }

+    if ( _needUTF8Convert )
+    {
+        if ( debug > 4 )
+            cout << "Converting UTF-8 characters" << endl;
+
+        // Down-convert the UTF-8 body to ISO-8859-1.  Code points
+        // U+0080..U+00FF (lead byte 0xC2/0xC3) become one Latin-1 byte;
+        // every other multi-byte or malformed sequence becomes one '?'.
+        // The text is built in a scratch buffer and assigned back so that
+        // _contents.length() below reflects the converted text
+        // (NOTE(review): assumes String tracks its own length - confirm).
+        const char *srcPtr = _response._contents.get();
+        char *converted = new char [strlen(srcPtr) + 1];
+        char *dstPtr = converted;
+        while ( *srcPtr )
+        {
+            unsigned char lead = (unsigned char) *srcPtr;
+            int seqLen;
+
+            if ( ( lead & 0x80 ) == 0 )
+                seqLen = 1;
+            else if ( ( lead & 0xE0 ) == 0xC0 )
+                seqLen = 2;
+            else if ( ( lead & 0xF0 ) == 0xE0 )
+                seqLen = 3;
+            else if ( ( lead & 0xF8 ) == 0xF0 )
+                seqLen = 4;
+            else
+                seqLen = 0;     // stray continuation or invalid lead byte
+
+            // Count the bytes really present: stop at the first byte that
+            // is not a continuation byte (this includes the terminating
+            // NUL), so a truncated sequence never walks off the buffer.
+            int have = 1;
+            while ( have < seqLen && ( srcPtr[have] & 0xC0 ) == 0x80 )
+                have++;
+
+            if ( seqLen == 1 )
+                *dstPtr++ = *srcPtr;
+            else if ( have == 2 && ( lead == 0xC2 || lead == 0xC3 ) )
+                *dstPtr++ = (char) ( ( ( lead & 0x03 ) << 6 ) | ( srcPtr[1] & 0x3F ) );
+            else
+                *dstPtr++ = '?';
+            srcPtr += have;
+        }
+        *dstPtr = 0;
+
+        _response._contents = converted;
+        delete [] converted;
+    }
+
     // Set document length
     _response._document_length = _response._contents.length();

Index: htnet/HtHTTP.h
--- htnet/HtHTTP.h.orig	2004-05-28 15:15:23 +0200
+++ htnet/HtHTTP.h	2006-06-17 22:25:21 +0200
@@ -316,6 +316,7 @@
    int      	_bytes_read;        // Bytes read
    URL		_url;               // URL to retrieve
    URL		_referer;	    // Referring URL
+   int		_needUTF8Convert;   // Flag for simple UTF-8 convert

    String      _accept_language;    // accept-language directive

Index: htnet/SSLConnection.cc
--- htnet/SSLConnection.cc.orig	2004-05-28 15:15:23 +0200
+++ htnet/SSLConnection.cc	2006-06-17 22:25:21 +0200
@@ -131,6 +131,7 @@
     {
       errno = 0;

+      if (!SSL_pending(ssl)) {
       if (timeout_value > 0) {
           FD_SET_T fds;
           FD_ZERO(&fds);
@@ -144,6 +145,7 @@
           if (selected <= 0)
               need_io_stop++;
       }
+      }

       if (!need_io_stop)
           count = SSL_read(ssl, buffer, maxlength);
Index: htsearch/Collection.h
--- htsearch/Collection.h.orig	2004-05-28 15:15:24 +0200
+++ htsearch/Collection.h	2006-06-17 22:28:24 +0200
@@ -36,9 +36,9 @@
                const char *docExcerpt);
     ~Collection();

-    void Collection::Open();
+    void Open();

-    void Collection::Close();
+    void Close();

     char *getWordFile() { return wordFile.get(); }
     DocumentRef         *getDocumentRef(int id);
Index: htsearch/Display.cc
--- htsearch/Display.cc.orig	2004-05-28 15:15:24 +0200
+++ htsearch/Display.cc	2006-06-17 22:25:21 +0200
@@ -362,7 +362,7 @@

     if (maxScore != 0 && maxScore != minScore)
       {
-	int percent = (int)((ref->DocScore() - minScore) * 100 /
+	int percent = (int)((ref->DocScore() - minScore) * 100.0 /
 			    (maxScore - minScore));
 	if (percent <= 0)
 	  percent = 1;
@@ -694,6 +694,38 @@
     //
     if (nPages > 1)
     {
+	// Assume number of page links is equal to maximum_page_buttons
+	// For example, if pageNumber=9, maximum_page_buttons=10,
+	// and nPages>=13, we get:
+	//
+	// [prev] 4 5 6 7 8 9 10 11 12 13 [next]
+
+	int nPageButtons = config->Value("maximum_page_buttons", 10);
+
+	// Initialize indexes of page links
+	int first_page_index = 1;
+	int last_page_index  = nPages;
+
+	if (nPages > nPageButtons)
+	{
+	   // Try to center the current page
+	   int links_on_the_left = nPageButtons/2;
+	   first_page_index = pageNumber - links_on_the_left;
+	   last_page_index  = first_page_index + nPageButtons - 1;
+
+	   // Adjust if required
+	   if (first_page_index < 1)
+	   {
+	      first_page_index = 1;
+	      last_page_index  = nPageButtons;
+	   }
+	   else if (last_page_index > nPages )
+	   {
+	      last_page_index  = nPages;
+	      first_page_index = nPages - nPageButtons + 1;
+	   }
+	}
+
 	if (pageNumber > 1)
 	{
 	    str = new String("<a href=\"");
@@ -725,9 +757,8 @@
 	QuotedStringList	pnt(config->Find("page_number_text"), " \t\r\n");
 	QuotedStringList	npnt(config->Find("no_page_number_text"), " \t\r\n");
 	QuotedStringList	sep(config->Find("page_number_separator"), " \t\r\n");
-	if (nPages > config->Value("maximum_page_buttons", 10))
-	    nPages = config->Value("maximum_page_buttons", 10);
-	for (i = 1; i <= nPages; i++)
+
+	for (i = first_page_index; i <= last_page_index; i++)
 	{
 	    if (i == pageNumber)
 	    {
Index: httools/htmerge.cc
--- httools/htmerge.cc.orig	2004-05-28 15:15:25 +0200
+++ httools/htmerge.cc	2006-06-17 22:25:21 +0200
@@ -191,6 +191,64 @@
     return 0;
 }

+// Context record passed (as an Object) to OverrideCallback/DeleteCallback below.
+class CallbackData : public Object
+{
+public:
+  CallbackData(HtWordList * w, Dictionary * d, int o)
+   { word_db = w; dup_ids = d; docIDOffset = o; }  // stores the raw pointers exactly as given
+
+  HtWordList * word_db;   // word list the callbacks call Override()/Delete() on
+  Dictionary * dup_ids;   // DocID keys the callbacks test with Exists()
+  int docIDOffset;        // added to each DocID by OverrideCallback
+};
+
+
+//*****************************************************************************
+// int OverrideCallback(WordList * wl, WordDBCursor &,
+//                      const WordReference * w, Object & d )
+//   Unless w's DocID is a key in d.dup_ids: add d.docIDOffset to it and Override() it into d.word_db.
+int
+OverrideCallback(WordList * wl,               // not referenced in the body
+                 WordDBCursor &,              // unnamed: not used
+                 const WordReference * w,     // the word reference to process
+                 Object & d)                  // must actually be a CallbackData
+{
+  CallbackData & data = ((CallbackData &)d);
+  HtWordReference * ht_wr = (HtWordReference *)w;  // const is cast away: the DocID is rewritten below
+  String docIDKey;
+
+  docIDKey << ht_wr->DocID();                 // DocID rendered as the dictionary key
+  if (!((data.dup_ids)->Exists(docIDKey)))    // references whose DocID is listed are left alone
+    {
+     ht_wr->DocID(ht_wr->DocID() + data.docIDOffset);  // shift the DocID by the stored offset
+     (data.word_db)->Override(*ht_wr);        // write the renumbered reference to the target list
+    }
+
+  return OK;                                  // always OK, whether or not anything was written
+}
+
+//*****************************************************************************
+// int DeleteCallback(WordList * wl, WordDBCursor &,
+//                    const WordReference * w, Object & d )
+//   If w's DocID is a key in d.dup_ids, Delete() the reference from d.word_db.
+int
+DeleteCallback(WordList * wl,                 // not referenced in the body
+               WordDBCursor &,                // unnamed: not used
+               const WordReference * w,       // the word reference to process
+               Object & d)                    // must actually be a CallbackData
+{
+  CallbackData & data = ((CallbackData &)d);
+  HtWordReference * ht_wr = (HtWordReference *)w;  // cast to the Ht subclass to reach DocID()
+  String docIDKey;
+
+  docIDKey << ht_wr->DocID();                 // DocID rendered as the dictionary key
+  if ((data.dup_ids)->Exists(docIDKey))       // only references whose DocID is listed are removed
+    (data.word_db)->Delete(*ht_wr);
+
+  return OK;                                  // always OK, whether or not anything was deleted
+}
+
 //*****************************************************************************
 // void mergeDB()
 //
@@ -316,8 +374,6 @@

     // OK, after merging the doc DBs, we do the same for the words
     HtWordList	mergeWordDB(*config), wordDB(*config);
-    List	*words;
-    String	docIDKey;

     if (wordDB.Open(config->Find("word_db"), O_RDWR) < 0)
     {
@@ -332,33 +388,24 @@
     }

     // Start the merging by going through all the URLs that are in
-    // the database to be merged
-
-    words = mergeWordDB.WordRefs();
+    // the database to be merged
+    WordCursor    *mergeCursor;
+    WordKey        empty;

-    words->Start_Get();
-    HtWordReference   *word;
-    while ((word = (HtWordReference *) words->Get_Next()))
     {
-      docIDKey = word->DocID();
-      if (merge_dup_ids.Exists(docIDKey))
-      continue;
-
-      word->DocID(word->DocID() + docIDOffset);
-      wordDB.Override(*word);
+      CallbackData data(&wordDB, &merge_dup_ids, docIDOffset);
+      mergeCursor = mergeWordDB.Cursor(empty, OverrideCallback, (Object *)&data);
+      mergeCursor->Walk();
+      delete mergeCursor;
     }
-    delete words;

-    words = wordDB.WordRefs();
-    words->Start_Get();
-    while ((word = (HtWordReference *) words->Get_Next()))
     {
-      docIDKey = word->DocID();
-      if (db_dup_ids.Exists(docIDKey))
-      wordDB.Delete(*word);
+      CallbackData data(&wordDB, &db_dup_ids, 0);
+      mergeCursor = wordDB.Cursor(empty,DeleteCallback, (Object *)&data);
+      mergeCursor->Walk();
+      delete mergeCursor;
     }
-    delete words;
-
+
     // Cleanup--just close the two word databases
     mergeWordDB.Close();
     wordDB.Close();
Index: installdir/htdig.conf
--- installdir/htdig.conf.orig	2004-02-08 11:19:33 +0100
+++ installdir/htdig.conf	2006-06-17 22:25:21 +0200
@@ -47,7 +47,7 @@
 # long list of URLs, it may be wise to replace it with something like
 # http://www.  or comment this out and use the compiled-in default.
 #
-common_url_parts:	${limit_urls_to} .html .htm .shtml
+common_url_parts:	${limit_urls_to} .html .htm .shtml .php

 #
 # If there are particular pages that you definitely do NOT want to index, you
@@ -70,7 +70,7 @@
 # actual strings.
 #
 bad_extensions:		.wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif \
-	.jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css
+	.jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css .js .png .ico

 #
 # The string htdig will send in every request to identify the robot.  Change
Index: installdir/rundig
--- installdir/rundig.orig	2003-12-29 09:49:05 +0100
+++ installdir/rundig	2006-06-17 22:25:21 +0200
@@ -30,7 +30,6 @@
 done

 # If -a specified, note the database directory to move the temp files correctly
-# TODO: Should also check for files relative to COMMONDIR.
 if [ -f "$conffile" ]
 then
     new_db_dir=`awk '/^[^#a-zA-Z]*database_dir/ { print $NF }' < $conffile`
@@ -38,6 +37,11 @@
     then
 	DBDIR=$new_db_dir
     fi
+    new_dir=`awk '/^[^#a-zA-Z]*common_dir/ { print $NF }' < $conffile`
+    if [ "$new_dir" != "" ]
+    then
+	COMMONDIR=$new_dir
+    fi
 else
     echo "Config file $conffile cannot be found"
     exit 1