You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
356 lines
10 KiB
356 lines
10 KiB
Index: htdig/ExternalParser.cc |
|
--- htdig/ExternalParser.cc.orig 2004-05-28 15:15:14 +0200 |
|
+++ htdig/ExternalParser.cc 2006-06-17 22:25:21 +0200 |
|
@@ -227,7 +227,7 @@ |
|
int get_file = (convertToType.length() != 0); |
|
String newcontent; |
|
|
|
- StringList cpargs(currentParser); |
|
+ StringList cpargs(currentParser, " \t"); |
|
char **parsargs = new char * [cpargs.Count() + 5]; |
|
int argi; |
|
for (argi = 0; argi < cpargs.Count(); argi++) |
|
@@ -424,7 +424,7 @@ |
|
{ |
|
metadatetags = new StringMatch(); |
|
metadatetags->IgnoreCase(); |
|
- metadatetags->Pattern("date|dc.date|dc.date.created|dc.data.modified"); |
|
+ metadatetags->Pattern("date|dc.date|dc.date.created|dc.date.modified"); |
|
} |
|
|
|
// <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5> |
|
Index: htnet/HtHTTP.cc |
|
--- htnet/HtHTTP.cc.orig 2004-05-28 15:15:23 +0200 |
|
+++ htnet/HtHTTP.cc 2006-06-17 22:25:21 +0200 |
|
@@ -643,6 +643,8 @@ |
|
String line = 0; |
|
int inHeader = 1; |
|
|
|
+ _needUTF8Convert = 0; |
|
+ |
|
if (_response._modification_time) |
|
{ |
|
delete _response._modification_time; |
|
@@ -731,7 +733,15 @@ |
|
token = strtok(token, "\n\t"); |
|
|
|
if (token && *token) |
|
+ { |
|
_response._content_type = token; |
|
+ if ((_response._content_type.indexOf("text/html") != -1) && (_response._content_type.indexOf("UTF-8") != -1)) |
|
+ { |
|
+ if ( debug > 4 ) |
|
+ cout << "needUTF8Convert flagged" << endl; |
|
+ _needUTF8Convert = 1; |
|
+ } |
|
+ } |
|
|
|
} |
|
else if( ! mystrncasecmp((char*)line, "content-length:", 15)) |
|
@@ -970,6 +980,31 @@ |
|
|
|
} |
|
|
|
+ if ( _needUTF8Convert ) |
|
+ { |
|
+ if ( debug > 4 ) |
|
+ cout << "Converting UTF-8 characters" << endl; |
|
+ |
|
+ char *srcPtr, *dstPtr; |
|
+ srcPtr = dstPtr = _response._contents.get(); |
|
+ while ( *srcPtr ) |
|
+ { |
|
+ if ( ( *srcPtr & 0x80 ) == 0 ) |
|
+ *dstPtr++ = *srcPtr++; |
|
+ else if ( ( *srcPtr & 0xE0 ) == 0xC0 ) { |
|
+ *dstPtr++ = (((*srcPtr & 0x03) << 6) | (*(srcPtr+1) & 0x3F) ) & 0xFF; |
|
+ srcPtr += 2; |
|
+ } else if ( ( *srcPtr & 0xF0 ) == 0xE0 ) { |
|
+ *dstPtr++ = '?'; |
|
+ srcPtr += 3; |
|
+ } else { |
|
+ *dstPtr++ = '?'; |
|
+ srcPtr += 4; |
|
+ } |
|
+ } |
|
+ *dstPtr = 0; |
|
+ } |
|
+ |
|
// Set document length |
|
_response._document_length = _response._contents.length(); |
|
|
|
Index: htnet/HtHTTP.h |
|
--- htnet/HtHTTP.h.orig 2004-05-28 15:15:23 +0200 |
|
+++ htnet/HtHTTP.h 2006-06-17 22:25:21 +0200 |
|
@@ -316,6 +316,7 @@ |
|
int _bytes_read; // Bytes read |
|
URL _url; // URL to retrieve |
|
URL _referer; // Referring URL |
|
+ int _needUTF8Convert; // Nonzero when the response contents need the simple UTF-8 conversion |
|
|
|
String _accept_language; // accept-language directive |
|
|
|
Index: htnet/SSLConnection.cc |
|
--- htnet/SSLConnection.cc.orig 2004-05-28 15:15:23 +0200 |
|
+++ htnet/SSLConnection.cc 2006-06-17 22:25:21 +0200 |
|
@@ -131,6 +131,7 @@ |
|
{ |
|
errno = 0; |
|
|
|
+ if (!SSL_pending(ssl)) { |
|
if (timeout_value > 0) { |
|
FD_SET_T fds; |
|
FD_ZERO(&fds); |
|
@@ -144,6 +145,7 @@ |
|
if (selected <= 0) |
|
need_io_stop++; |
|
} |
|
+ } |
|
|
|
if (!need_io_stop) |
|
count = SSL_read(ssl, buffer, maxlength); |
|
Index: htsearch/Collection.h |
|
--- htsearch/Collection.h.orig 2004-05-28 15:15:24 +0200 |
|
+++ htsearch/Collection.h 2006-06-17 22:28:24 +0200 |
|
@@ -36,9 +36,9 @@ |
|
const char *docExcerpt); |
|
~Collection(); |
|
|
|
- void Collection::Open(); |
|
+ void Open(); |
|
|
|
- void Collection::Close(); |
|
+ void Close(); |
|
|
|
char *getWordFile() { return wordFile.get(); } |
|
DocumentRef *getDocumentRef(int id); |
|
Index: htsearch/Display.cc |
|
--- htsearch/Display.cc.orig 2004-05-28 15:15:24 +0200 |
|
+++ htsearch/Display.cc 2006-06-17 22:25:21 +0200 |
|
@@ -362,7 +362,7 @@ |
|
|
|
if (maxScore != 0 && maxScore != minScore) |
|
{ |
|
- int percent = (int)((ref->DocScore() - minScore) * 100 / |
|
+ int percent = (int)((ref->DocScore() - minScore) * 100.0 / |
|
(maxScore - minScore)); |
|
if (percent <= 0) |
|
percent = 1; |
|
@@ -694,6 +694,38 @@ |
|
// |
|
if (nPages > 1) |
|
{ |
|
+ // Show at most maximum_page_buttons page links around the current page. |
|
+ // For example, if pageNumber=9, maximum_page_buttons=10, |
|
+ // and nPages>=13, we get: |
|
+ // |
|
+ // [prev] 4 5 6 7 8 9 10 11 12 13 [next] |
|
+ |
|
+ int nPageButtons = config->Value("maximum_page_buttons", 10); |
|
+ |
|
+ // Initialize the indexes of the first and last page links |
|
+ int first_page_index = 1; |
|
+ int last_page_index = nPages; |
|
+ |
|
+ if (nPages > nPageButtons) |
|
+ { |
|
+ // Try to center the current page |
|
+ int links_on_the_left = nPageButtons/2; |
|
+ first_page_index = pageNumber - links_on_the_left; |
|
+ last_page_index = first_page_index + nPageButtons - 1; |
|
+ |
|
+ // Adjust if required |
|
+ if (first_page_index < 1) |
|
+ { |
|
+ first_page_index = 1; |
|
+ last_page_index = nPageButtons; |
|
+ } |
|
+ else if (last_page_index > nPages ) |
|
+ { |
|
+ last_page_index = nPages; |
|
+ first_page_index = nPages - nPageButtons + 1; |
|
+ } |
|
+ } |
|
+ |
|
if (pageNumber > 1) |
|
{ |
|
str = new String("<a href=\""); |
|
@@ -725,9 +757,8 @@ |
|
QuotedStringList pnt(config->Find("page_number_text"), " \t\r\n"); |
|
QuotedStringList npnt(config->Find("no_page_number_text"), " \t\r\n"); |
|
QuotedStringList sep(config->Find("page_number_separator"), " \t\r\n"); |
|
- if (nPages > config->Value("maximum_page_buttons", 10)) |
|
- nPages = config->Value("maximum_page_buttons", 10); |
|
- for (i = 1; i <= nPages; i++) |
|
+ |
|
+ for (i = first_page_index; i <= last_page_index; i++) |
|
{ |
|
if (i == pageNumber) |
|
{ |
|
Index: httools/htmerge.cc |
|
--- httools/htmerge.cc.orig 2004-05-28 15:15:25 +0200 |
|
+++ httools/htmerge.cc 2006-06-17 22:25:21 +0200 |
|
@@ -191,6 +191,64 @@ |
|
return 0; |
|
} |
|
|
|
+// Record holding the data passed to the word cursor callbacks below |
|
+class CallbackData : public Object |
|
+{ |
|
+public: |
|
+ CallbackData(HtWordList * w, Dictionary * d, int o) |
|
+ { word_db = w; dup_ids = d; docIDOffset = o; } |
|
+ |
|
+ HtWordList * word_db; |
|
+ Dictionary * dup_ids; |
|
+ int docIDOffset; |
|
+}; |
|
+ |
|
+ |
|
+//***************************************************************************** |
|
+// int OverrideCallback(WordList * wl, WordDBCursor &, |
|
+// const WordReference * w, Object & d ) |
|
+// |
|
+int |
|
+OverrideCallback(WordList * wl, |
|
+ WordDBCursor &, |
|
+ const WordReference * w, |
|
+ Object & d) |
|
+{ |
|
+ CallbackData & data = ((CallbackData &)d); |
|
+ HtWordReference * ht_wr = (HtWordReference *)w; |
|
+ String docIDKey; |
|
+ |
|
+ docIDKey << ht_wr->DocID(); |
|
+ if (!((data.dup_ids)->Exists(docIDKey))) |
|
+ { |
|
+ ht_wr->DocID(ht_wr->DocID() + data.docIDOffset); |
|
+ (data.word_db)->Override(*ht_wr); |
|
+ } |
|
+ |
|
+ return OK; |
|
+} |
|
+ |
|
+//***************************************************************************** |
|
+// int DeleteCallback(WordList * wl, WordDBCursor &, |
|
+// const WordReference * w, Object & d ) |
|
+// |
|
+int |
|
+DeleteCallback(WordList * wl, |
|
+ WordDBCursor &, |
|
+ const WordReference * w, |
|
+ Object & d) |
|
+{ |
|
+ CallbackData & data = ((CallbackData &)d); |
|
+ HtWordReference * ht_wr = (HtWordReference *)w; |
|
+ String docIDKey; |
|
+ |
|
+ docIDKey << ht_wr->DocID(); |
|
+ if ((data.dup_ids)->Exists(docIDKey)) |
|
+ (data.word_db)->Delete(*ht_wr); |
|
+ |
|
+ return OK; |
|
+} |
|
+ |
|
//***************************************************************************** |
|
// void mergeDB() |
|
// |
|
@@ -316,8 +374,6 @@ |
|
|
|
// OK, after merging the doc DBs, we do the same for the words |
|
HtWordList mergeWordDB(*config), wordDB(*config); |
|
- List *words; |
|
- String docIDKey; |
|
|
|
if (wordDB.Open(config->Find("word_db"), O_RDWR) < 0) |
|
{ |
|
@@ -332,33 +388,24 @@ |
|
} |
|
|
|
// Start the merging by going through all the URLs that are in |
|
- // the database to be merged |
|
- |
|
- words = mergeWordDB.WordRefs(); |
|
+ // the database to be merged |
|
+ WordCursor *mergeCursor; |
|
+ WordKey empty; |
|
|
|
- words->Start_Get(); |
|
- HtWordReference *word; |
|
- while ((word = (HtWordReference *) words->Get_Next())) |
|
{ |
|
- docIDKey = word->DocID(); |
|
- if (merge_dup_ids.Exists(docIDKey)) |
|
- continue; |
|
- |
|
- word->DocID(word->DocID() + docIDOffset); |
|
- wordDB.Override(*word); |
|
+ CallbackData data(&wordDB, &merge_dup_ids, docIDOffset); |
|
+ mergeCursor = mergeWordDB.Cursor(empty, OverrideCallback, (Object *)&data); |
|
+ mergeCursor->Walk(); |
|
+ delete mergeCursor; |
|
} |
|
- delete words; |
|
|
|
- words = wordDB.WordRefs(); |
|
- words->Start_Get(); |
|
- while ((word = (HtWordReference *) words->Get_Next())) |
|
{ |
|
- docIDKey = word->DocID(); |
|
- if (db_dup_ids.Exists(docIDKey)) |
|
- wordDB.Delete(*word); |
|
+ CallbackData data(&wordDB, &db_dup_ids, 0); |
|
+ mergeCursor = wordDB.Cursor(empty,DeleteCallback, (Object *)&data); |
|
+ mergeCursor->Walk(); |
|
+ delete mergeCursor; |
|
} |
|
- delete words; |
|
- |
|
+ |
|
// Cleanup--just close the two word databases |
|
mergeWordDB.Close(); |
|
wordDB.Close(); |
|
Index: installdir/htdig.conf |
|
--- installdir/htdig.conf.orig 2004-02-08 11:19:33 +0100 |
|
+++ installdir/htdig.conf 2006-06-17 22:25:21 +0200 |
|
@@ -47,7 +47,7 @@ |
|
# long list of URLs, it may be wise to replace it with something like |
|
# http://www. or comment this out and use the compiled-in default. |
|
# |
|
-common_url_parts: ${limit_urls_to} .html .htm .shtml |
|
+common_url_parts: ${limit_urls_to} .html .htm .shtml .php |
|
|
|
# |
|
# If there are particular pages that you definitely do NOT want to index, you |
|
@@ -70,7 +70,7 @@ |
|
# actual strings. |
|
# |
|
bad_extensions: .wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif \ |
|
- .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css |
|
+ .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css .js .png .ico |
|
|
|
# |
|
# The string htdig will send in every request to identify the robot. Change |
|
Index: installdir/rundig |
|
--- installdir/rundig.orig 2003-12-29 09:49:05 +0100 |
|
+++ installdir/rundig 2006-06-17 22:25:21 +0200 |
|
@@ -30,7 +30,6 @@ |
|
done |
|
|
|
# If -a specified, note the database directory to move the temp files correctly |
|
-# TODO: Should also check for files relative to COMMONDIR. |
|
if [ -f "$conffile" ] |
|
then |
|
new_db_dir=`awk '/^[^#a-zA-Z]*database_dir/ { print $NF }' < $conffile` |
|
@@ -38,6 +37,11 @@ |
|
then |
|
DBDIR=$new_db_dir |
|
fi |
|
+ new_dir=`awk '/^[^#a-zA-Z]*common_dir/ { print $NF }' < $conffile` |
|
+ if [ "$new_dir" != "" ] |
|
+ then |
|
+ COMMONDIR=$new_dir |
|
+ fi |
|
else |
|
echo "Config file $conffile cannot be found" |
|
exit 1
|
|
|