// Patch against an htdig source tree (unified diff, nine files).
// Everything above the first "Index:" header is descriptive text only.
//
// htdig/ExternalParser.cc
//   * The external parser command is split with StringList(currentParser, " \t")
//     instead of the one-argument StringList constructor.
//   * Meta date tag pattern typo fixed: "dc.data.modified" -> "dc.date.modified".
//
// htnet/HtHTTP.cc, htnet/HtHTTP.h
//   * New int member _needUTF8Convert. It is cleared next to the inHeader
//     initialisation and set to 1 when the stored content type contains both
//     "text/html" and "UTF-8".
//   * When the flag is set, _response._contents is rewritten in place: bytes
//     with the top bit clear are copied; a 110xxxxx lead byte is folded with the
//     following byte into a single output byte; a 1110xxxx lead byte emits '?'
//     and skips 3 bytes; any other byte emits '?' and skips 4 bytes.
//   NOTE(review): the two-byte branch keeps only the low two bits of the lead
//     byte (mask 0x03), so only lead bytes 0xC2/0xC3 (U+0080..U+00FF) come out
//     right; other two-byte sequences produce an unrelated byte, not '?'.
//   NOTE(review): srcPtr is advanced by 2/3/4 without looking at the skipped
//     bytes, so a truncated sequence at the end of the buffer steps over the
//     terminating 0 that the while condition tests. Bytes of the form 10xxxxxx
//     also land in the final else branch and skip 4 bytes.
//   NOTE(review): _document_length is taken from _response._contents.length()
//     right after the in-place shrink; whether that length reflects the new
//     terminator is not visible in this patch -- confirm.
//   NOTE(review): if String::indexOf is case-sensitive, a header spelling such
//     as "charset=utf-8" would not set the flag -- confirm.
//
// htnet/SSLConnection.cc
//   * The timeout wait block is wrapped in "if (!SSL_pending(ssl)) { ... }", so
//     SSL_read() is reached without that wait when SSL_pending() is non-zero.
//
// htsearch/Collection.h
//   * Drops the "Collection::" qualification from the in-class declarations of
//     Open() and Close().
//
// htsearch/Display.cc
//   * The score percentage is computed with 100.0 instead of 100.
//   * Page links: nPageButtons is read from "maximum_page_buttons" (default 10);
//     first_page_index/last_page_index describe a window of that many links
//     placed around pageNumber and shifted to stay inside 1..nPages. The link
//     loop runs over that window instead of clamping nPages.
//   NOTE(review): in this copy of the patch the text between
//     'str = new String("' and 'Find("page_number_text")' looks damaged (a hunk
//     header appears to be missing) -- compare against a pristine copy.
//
// httools/htmerge.cc
//   * Adds class CallbackData (word_db, dup_ids, docIDOffset) and two cursor
//     callbacks. OverrideCallback adds docIDOffset to the DocID and calls
//     Override() on word_db when the DocID key is not in dup_ids; DeleteCallback
//     calls Delete() on word_db when the key is in dup_ids. Both return OK.
//   * mergeDB() no longer walks the lists returned by WordRefs(); it creates a
//     cursor with Cursor(empty, callback, &data), calls Walk() and deletes it.
//   NOTE(review): both callbacks cast the "const WordReference *" argument to a
//     non-const HtWordReference *, and OverrideCallback modifies it in place.
//
// installdir/htdig.conf
//   * common_url_parts gains ".php"; bad_extensions gains ".js .png .ico".
//
// installdir/rundig
//   * Removes the COMMONDIR TODO and reads common_dir from the config file with
//     awk, assigning COMMONDIR when the value is non-empty (same shape as the
//     existing database_dir -> DBDIR handling).
//
Index: htdig/ExternalParser.cc --- htdig/ExternalParser.cc.orig 2004-05-28 15:15:14 +0200 +++ htdig/ExternalParser.cc 2006-06-17 22:25:21 +0200 @@ -227,7 +227,7 @@ int get_file = (convertToType.length() != 0); String newcontent; - StringList cpargs(currentParser); + StringList cpargs(currentParser, " \t"); char **parsargs = new char * [cpargs.Count() + 5]; int argi; for (argi = 0; argi < cpargs.Count(); argi++) @@ -424,7 +424,7 @@ { metadatetags = new StringMatch(); metadatetags->IgnoreCase(); - metadatetags->Pattern("date|dc.date|dc.date.created|dc.data.modified"); + metadatetags->Pattern("date|dc.date|dc.date.created|dc.date.modified"); } // Index: htnet/HtHTTP.cc --- htnet/HtHTTP.cc.orig 2004-05-28 15:15:23 +0200 +++ htnet/HtHTTP.cc 2006-06-17 22:25:21 +0200 @@ -643,6 +643,8 @@ String line = 0; int inHeader = 1; + _needUTF8Convert = 0; + if (_response._modification_time) { delete _response._modification_time; @@ -731,7 +733,15 @@ token = strtok(token, "\n\t"); if (token && *token) + { _response._content_type = token; + if ((_response._content_type.indexOf("text/html") != -1) && (_response._content_type.indexOf("UTF-8") != -1)) + { + if ( debug > 4 ) + cout << "needUTF8Convert flagged" << endl; + _needUTF8Convert = 1; + } + } } else if( ! 
mystrncasecmp((char*)line, "content-length:", 15)) @@ -970,6 +980,31 @@ } + if ( _needUTF8Convert ) + { + if ( debug > 4 ) + cout << "Converting UTF-8 characters" << endl; + + char *srcPtr, *dstPtr; + srcPtr = dstPtr = _response._contents.get(); + while ( *srcPtr ) + { + if ( ( *srcPtr & 0x80 ) == 0 ) + *dstPtr++ = *srcPtr++; + else if ( ( *srcPtr & 0xE0 ) == 0xC0 ) { + *dstPtr++ = (((*srcPtr & 0x03) << 6) | (*(srcPtr+1) & 0x3F) ) & 0xFF; + srcPtr += 2; + } else if ( ( *srcPtr & 0xF0 ) == 0xE0 ) { + *dstPtr++ = '?'; + srcPtr += 3; + } else { + *dstPtr++ = '?'; + srcPtr += 4; + } + } + *dstPtr = 0; + } + // Set document length _response._document_length = _response._contents.length(); Index: htnet/HtHTTP.h --- htnet/HtHTTP.h.orig 2004-05-28 15:15:23 +0200 +++ htnet/HtHTTP.h 2006-06-17 22:25:21 +0200 @@ -316,6 +316,7 @@ int _bytes_read; // Bytes read URL _url; // URL to retrieve URL _referer; // Referring URL + int _needUTF8Convert; // Flag for simple UTF-8 convert String _accept_language; // accept-language directive Index: htnet/SSLConnection.cc --- htnet/SSLConnection.cc.orig 2004-05-28 15:15:23 +0200 +++ htnet/SSLConnection.cc 2006-06-17 22:25:21 +0200 @@ -131,6 +131,7 @@ { errno = 0; + if (!SSL_pending(ssl)) { if (timeout_value > 0) { FD_SET_T fds; FD_ZERO(&fds); @@ -144,6 +145,7 @@ if (selected <= 0) need_io_stop++; } + } if (!need_io_stop) count = SSL_read(ssl, buffer, maxlength); Index: htsearch/Collection.h --- htsearch/Collection.h.orig 2004-05-28 15:15:24 +0200 +++ htsearch/Collection.h 2006-06-17 22:28:24 +0200 @@ -36,9 +36,9 @@ const char *docExcerpt); ~Collection(); - void Collection::Open(); + void Open(); - void Collection::Close(); + void Close(); char *getWordFile() { return wordFile.get(); } DocumentRef *getDocumentRef(int id); Index: htsearch/Display.cc --- htsearch/Display.cc.orig 2004-05-28 15:15:24 +0200 +++ htsearch/Display.cc 2006-06-17 22:25:21 +0200 @@ -362,7 +362,7 @@ if (maxScore != 0 && maxScore != minScore) { - int percent = 
(int)((ref->DocScore() - minScore) * 100 / + int percent = (int)((ref->DocScore() - minScore) * 100.0 / (maxScore - minScore)); if (percent <= 0) percent = 1; @@ -694,6 +694,38 @@ // if (nPages > 1) { + // Assume number of page links is equal to maximum_page_buttons + // For example, if pageNumber=9, maximum_page_buttons=10, + // and nPages>=13, we get: + // + // [prev] 4 5 6 7 8 9 10 11 12 13 [next] + + int nPageButtons = config->Value("maximum_page_buttons", 10); + + // Initialize indexes of pages links + int first_page_index = 1; + int last_page_index = nPages; + + if (nPages > nPageButtons) + { + // Try to center the current page + int links_on_the_left = nPageButtons/2; + first_page_index = pageNumber - links_on_the_left; + last_page_index = first_page_index + nPageButtons - 1; + + // Adjust if required + if (first_page_index < 1) + { + first_page_index = 1; + last_page_index = nPageButtons; + } + else if (last_page_index > nPages ) + { + last_page_index = nPages; + first_page_index = nPages - nPageButtons + 1; + } + } + if (pageNumber > 1) { str = new String("Find("page_number_text"), " \t\r\n"); QuotedStringList npnt(config->Find("no_page_number_text"), " \t\r\n"); QuotedStringList sep(config->Find("page_number_separator"), " \t\r\n"); - if (nPages > config->Value("maximum_page_buttons", 10)) - nPages = config->Value("maximum_page_buttons", 10); - for (i = 1; i <= nPages; i++) + + for (i = first_page_index; i <= last_page_index; i++) { if (i == pageNumber) { Index: httools/htmerge.cc --- httools/htmerge.cc.orig 2004-05-28 15:15:25 +0200 +++ httools/htmerge.cc 2006-06-17 22:25:21 +0200 @@ -191,6 +191,64 @@ return 0; } +// Declare a record for storing callback data +class CallbackData : public Object +{ +public: + CallbackData(HtWordList * w, Dictionary * d, int o) + { word_db = w; dup_ids = d; docIDOffset = o; } + + HtWordList * word_db; + Dictionary * dup_ids; + int docIDOffset; +}; + + 
+//***************************************************************************** +// int OverrideCallback(WordList * wl, WordDBCursor &, +// const WordReference * w, Object & d ) +// +int +OverrideCallback(WordList * wl, + WordDBCursor &, + const WordReference * w, + Object & d) +{ + CallbackData & data = ((CallbackData &)d); + HtWordReference * ht_wr = (HtWordReference *)w; + String docIDKey; + + docIDKey << ht_wr->DocID(); + if (!((data.dup_ids)->Exists(docIDKey))) + { + ht_wr->DocID(ht_wr->DocID() + data.docIDOffset); + (data.word_db)->Override(*ht_wr); + } + + return OK; +} + +//***************************************************************************** +// int DeleteCallback(WordList * wl, WordDBCursor &, +// const WordReference * w, Object & d ) +// +int +DeleteCallback(WordList * wl, + WordDBCursor &, + const WordReference * w, + Object & d) +{ + CallbackData & data = ((CallbackData &)d); + HtWordReference * ht_wr = (HtWordReference *)w; + String docIDKey; + + docIDKey << ht_wr->DocID(); + if ((data.dup_ids)->Exists(docIDKey)) + (data.word_db)->Delete(*ht_wr); + + return OK; +} + //***************************************************************************** // void mergeDB() // @@ -316,8 +374,6 @@ // OK, after merging the doc DBs, we do the same for the words HtWordList mergeWordDB(*config), wordDB(*config); - List *words; - String docIDKey; if (wordDB.Open(config->Find("word_db"), O_RDWR) < 0) { @@ -332,33 +388,24 @@ } // Start the merging by going through all the URLs that are in - // the database to be merged - - words = mergeWordDB.WordRefs(); + // the database to be merged + WordCursor *mergeCursor; + WordKey empty; - words->Start_Get(); - HtWordReference *word; - while ((word = (HtWordReference *) words->Get_Next())) { - docIDKey = word->DocID(); - if (merge_dup_ids.Exists(docIDKey)) - continue; - - word->DocID(word->DocID() + docIDOffset); - wordDB.Override(*word); + CallbackData data(&wordDB, &merge_dup_ids, docIDOffset); + mergeCursor = 
mergeWordDB.Cursor(empty, OverrideCallback, (Object *)&data); + mergeCursor->Walk(); + delete mergeCursor; } - delete words; - words = wordDB.WordRefs(); - words->Start_Get(); - while ((word = (HtWordReference *) words->Get_Next())) { - docIDKey = word->DocID(); - if (db_dup_ids.Exists(docIDKey)) - wordDB.Delete(*word); + CallbackData data(&wordDB, &db_dup_ids, 0); + mergeCursor = wordDB.Cursor(empty,DeleteCallback, (Object *)&data); + mergeCursor->Walk(); + delete mergeCursor; } - delete words; - + // Cleanup--just close the two word databases mergeWordDB.Close(); wordDB.Close(); Index: installdir/htdig.conf --- installdir/htdig.conf.orig 2004-02-08 11:19:33 +0100 +++ installdir/htdig.conf 2006-06-17 22:25:21 +0200 @@ -47,7 +47,7 @@ # long list of URLs, it may be wise to replace it with something like # http://www. or comment this out and use the compiled-in default. # -common_url_parts: ${limit_urls_to} .html .htm .shtml +common_url_parts: ${limit_urls_to} .html .htm .shtml .php # # If there are particular pages that you definitely do NOT want to index, you @@ -70,7 +70,7 @@ # actual strings. # bad_extensions: .wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif \ - .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css + .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css .js .png .ico # # The string htdig will send in every request to identify the robot. Change Index: installdir/rundig --- installdir/rundig.orig 2003-12-29 09:49:05 +0100 +++ installdir/rundig 2006-06-17 22:25:21 +0200 @@ -30,7 +30,6 @@ done # If -a specified, note the database directory to move the temp files correctly -# TODO: Should also check for files relative to COMMONDIR. 
if [ -f "$conffile" ] then new_db_dir=`awk '/^[^#a-zA-Z]*database_dir/ { print $NF }' < $conffile` @@ -38,6 +37,11 @@ then DBDIR=$new_db_dir fi + new_dir=`awk '/^[^#a-zA-Z]*common_dir/ { print $NF }' < $conffile` + if [ "$new_dir" != "" ] + then + COMMONDIR=$new_dir + fi else echo "Config file $conffile cannot be found" exit 1