You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

356 lines
10 KiB

Index: htdig/ExternalParser.cc
--- htdig/ExternalParser.cc.orig 2004-05-28 15:15:14 +0200
+++ htdig/ExternalParser.cc 2006-06-17 22:25:21 +0200
@@ -227,7 +227,7 @@
int get_file = (convertToType.length() != 0);
String newcontent;
- StringList cpargs(currentParser);
+ StringList cpargs(currentParser, " \t");
char **parsargs = new char * [cpargs.Count() + 5];
int argi;
for (argi = 0; argi < cpargs.Count(); argi++)
@@ -424,7 +424,7 @@
{
metadatetags = new StringMatch();
metadatetags->IgnoreCase();
- metadatetags->Pattern("date|dc.date|dc.date.created|dc.data.modified");
+ metadatetags->Pattern("date|dc.date|dc.date.created|dc.date.modified");
}
// <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5>
Index: htnet/HtHTTP.cc
--- htnet/HtHTTP.cc.orig 2004-05-28 15:15:23 +0200
+++ htnet/HtHTTP.cc 2006-06-17 22:25:21 +0200
@@ -643,6 +643,8 @@
String line = 0;
int inHeader = 1;
+ _needUTF8Convert = 0;
+
if (_response._modification_time)
{
delete _response._modification_time;
@@ -731,7 +733,15 @@
token = strtok(token, "\n\t");
if (token && *token)
+ {
_response._content_type = token;
+ if ((_response._content_type.indexOf("text/html") != -1) && (_response._content_type.indexOf("UTF-8") != -1))
+ {
+ if ( debug > 4 )
+ cout << "needUTF8Convert flagged" << endl;
+ _needUTF8Convert = 1;
+ }
+ }
}
else if( ! mystrncasecmp((char*)line, "content-length:", 15))
@@ -970,6 +980,31 @@
}
+ if ( _needUTF8Convert )
+ {
+ if ( debug > 4 )
+ cout << "Converting UTF-8 characters" << endl;
+
+ char *srcPtr, *dstPtr;
+ srcPtr = dstPtr = _response._contents.get();
+ while ( *srcPtr )
+ {
+ if ( ( *srcPtr & 0x80 ) == 0 )
+ *dstPtr++ = *srcPtr++;
+ else if ( ( *srcPtr & 0xE0 ) == 0xC0 ) {
+ *dstPtr++ = (((*srcPtr & 0x03) << 6) | (*(srcPtr+1) & 0x3F) ) & 0xFF;
+ srcPtr += 2;
+ } else if ( ( *srcPtr & 0xF0 ) == 0xE0 ) {
+ *dstPtr++ = '?';
+ srcPtr += 3;
+ } else {
+ *dstPtr++ = '?';
+ srcPtr += 4;
+ }
+ }
+ *dstPtr = 0;
+ }
+
// Set document length
_response._document_length = _response._contents.length();
Index: htnet/HtHTTP.h
--- htnet/HtHTTP.h.orig 2004-05-28 15:15:23 +0200
+++ htnet/HtHTTP.h 2006-06-17 22:25:21 +0200
@@ -316,6 +316,7 @@
int _bytes_read; // Bytes read
URL _url; // URL to retrieve
URL _referer; // Referring URL
+ int _needUTF8Convert; // Flag for simple UTF-8 convert
String _accept_language; // accept-language directive
Index: htnet/SSLConnection.cc
--- htnet/SSLConnection.cc.orig 2004-05-28 15:15:23 +0200
+++ htnet/SSLConnection.cc 2006-06-17 22:25:21 +0200
@@ -131,6 +131,7 @@
{
errno = 0;
+ if (!SSL_pending(ssl)) {
if (timeout_value > 0) {
FD_SET_T fds;
FD_ZERO(&fds);
@@ -144,6 +145,7 @@
if (selected <= 0)
need_io_stop++;
}
+ }
if (!need_io_stop)
count = SSL_read(ssl, buffer, maxlength);
Index: htsearch/Collection.h
--- htsearch/Collection.h.orig 2004-05-28 15:15:24 +0200
+++ htsearch/Collection.h 2006-06-17 22:28:24 +0200
@@ -36,9 +36,9 @@
const char *docExcerpt);
~Collection();
- void Collection::Open();
+ void Open();
- void Collection::Close();
+ void Close();
char *getWordFile() { return wordFile.get(); }
DocumentRef *getDocumentRef(int id);
Index: htsearch/Display.cc
--- htsearch/Display.cc.orig 2004-05-28 15:15:24 +0200
+++ htsearch/Display.cc 2006-06-17 22:25:21 +0200
@@ -362,7 +362,7 @@
if (maxScore != 0 && maxScore != minScore)
{
- int percent = (int)((ref->DocScore() - minScore) * 100 /
+ int percent = (int)((ref->DocScore() - minScore) * 100.0 /
(maxScore - minScore));
if (percent <= 0)
percent = 1;
@@ -694,6 +694,38 @@
//
if (nPages > 1)
{
+ // Assume number of page links is equal to maximum_page_buttons
+ // For example, if pageNumber=9, maximum_page_buttons=10,
+ // and nPages>=13, we get:
+ //
+ // [prev] 4 5 6 7 8 9 10 11 12 13 [next]
+
+ int nPageButtons = config->Value("maximum_page_buttons", 10);
+
+ // Initialize indexes of pages links
+ int first_page_index = 1;
+ int last_page_index = nPages;
+
+ if (nPages > nPageButtons)
+ {
+ // Try to center the current page
+ int links_on_the_left = nPageButtons/2;
+ first_page_index = pageNumber - links_on_the_left;
+ last_page_index = first_page_index + nPageButtons - 1;
+
+ // Adjust if required
+ if (first_page_index < 1)
+ {
+ first_page_index = 1;
+ last_page_index = nPageButtons;
+ }
+ else if (last_page_index > nPages )
+ {
+ last_page_index = nPages;
+ first_page_index = nPages - nPageButtons + 1;
+ }
+ }
+
if (pageNumber > 1)
{
str = new String("<a href=\"");
@@ -725,9 +757,8 @@
QuotedStringList pnt(config->Find("page_number_text"), " \t\r\n");
QuotedStringList npnt(config->Find("no_page_number_text"), " \t\r\n");
QuotedStringList sep(config->Find("page_number_separator"), " \t\r\n");
- if (nPages > config->Value("maximum_page_buttons", 10))
- nPages = config->Value("maximum_page_buttons", 10);
- for (i = 1; i <= nPages; i++)
+
+ for (i = first_page_index; i <= last_page_index; i++)
{
if (i == pageNumber)
{
Index: httools/htmerge.cc
--- httools/htmerge.cc.orig 2004-05-28 15:15:25 +0200
+++ httools/htmerge.cc 2006-06-17 22:25:21 +0200
@@ -191,6 +191,64 @@
return 0;
}
+// Declare a record for storing callback data
+class CallbackData : public Object
+{
+public:
+ CallbackData(HtWordList * w, Dictionary * d, int o)
+ { word_db = w; dup_ids = d; docIDOffset = o; }
+
+ HtWordList * word_db;
+ Dictionary * dup_ids;
+ int docIDOffset;
+};
+
+
+//*****************************************************************************
+// int OverrideCallback(WordList * wl, WordDBCursor &,
+// const WordReference * w, Object & d )
+//
+int
+OverrideCallback(WordList * wl,
+ WordDBCursor &,
+ const WordReference * w,
+ Object & d)
+{
+ CallbackData & data = ((CallbackData &)d);
+ HtWordReference * ht_wr = (HtWordReference *)w;
+ String docIDKey;
+
+ docIDKey << ht_wr->DocID();
+ if (!((data.dup_ids)->Exists(docIDKey)))
+ {
+ ht_wr->DocID(ht_wr->DocID() + data.docIDOffset);
+ (data.word_db)->Override(*ht_wr);
+ }
+
+ return OK;
+}
+
+//*****************************************************************************
+// int DeleteCallback(WordList * wl, WordDBCursor &,
+// const WordReference * w, Object & d )
+//
+int
+DeleteCallback(WordList * wl,
+ WordDBCursor &,
+ const WordReference * w,
+ Object & d)
+{
+ CallbackData & data = ((CallbackData &)d);
+ HtWordReference * ht_wr = (HtWordReference *)w;
+ String docIDKey;
+
+ docIDKey << ht_wr->DocID();
+ if ((data.dup_ids)->Exists(docIDKey))
+ (data.word_db)->Delete(*ht_wr);
+
+ return OK;
+}
+
//*****************************************************************************
// void mergeDB()
//
@@ -316,8 +374,6 @@
// OK, after merging the doc DBs, we do the same for the words
HtWordList mergeWordDB(*config), wordDB(*config);
- List *words;
- String docIDKey;
if (wordDB.Open(config->Find("word_db"), O_RDWR) < 0)
{
@@ -332,33 +388,24 @@
}
// Start the merging by going through all the URLs that are in
- // the database to be merged
-
- words = mergeWordDB.WordRefs();
+ // the database to be merged
+ WordCursor *mergeCursor;
+ WordKey empty;
- words->Start_Get();
- HtWordReference *word;
- while ((word = (HtWordReference *) words->Get_Next()))
{
- docIDKey = word->DocID();
- if (merge_dup_ids.Exists(docIDKey))
- continue;
-
- word->DocID(word->DocID() + docIDOffset);
- wordDB.Override(*word);
+ CallbackData data(&wordDB, &merge_dup_ids, docIDOffset);
+ mergeCursor = mergeWordDB.Cursor(empty, OverrideCallback, (Object *)&data);
+ mergeCursor->Walk();
+ delete mergeCursor;
}
- delete words;
- words = wordDB.WordRefs();
- words->Start_Get();
- while ((word = (HtWordReference *) words->Get_Next()))
{
- docIDKey = word->DocID();
- if (db_dup_ids.Exists(docIDKey))
- wordDB.Delete(*word);
+ CallbackData data(&wordDB, &db_dup_ids, 0);
+ mergeCursor = wordDB.Cursor(empty,DeleteCallback, (Object *)&data);
+ mergeCursor->Walk();
+ delete mergeCursor;
}
- delete words;
-
+
// Cleanup--just close the two word databases
mergeWordDB.Close();
wordDB.Close();
Index: installdir/htdig.conf
--- installdir/htdig.conf.orig 2004-02-08 11:19:33 +0100
+++ installdir/htdig.conf 2006-06-17 22:25:21 +0200
@@ -47,7 +47,7 @@
# long list of URLs, it may be wise to replace it with something like
# http://www. or comment this out and use the compiled-in default.
#
-common_url_parts: ${limit_urls_to} .html .htm .shtml
+common_url_parts: ${limit_urls_to} .html .htm .shtml .php
#
# If there are particular pages that you definitely do NOT want to index, you
@@ -70,7 +70,7 @@
# actual strings.
#
bad_extensions: .wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif \
- .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css
+ .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css .js .png .ico
#
# The string htdig will send in every request to identify the robot. Change
Index: installdir/rundig
--- installdir/rundig.orig 2003-12-29 09:49:05 +0100
+++ installdir/rundig 2006-06-17 22:25:21 +0200
@@ -30,7 +30,6 @@
done
# If -a specified, note the database directory to move the temp files correctly
-# TODO: Should also check for files relative to COMMONDIR.
if [ -f "$conffile" ]
then
new_db_dir=`awk '/^[^#a-zA-Z]*database_dir/ { print $NF }' < $conffile`
@@ -38,6 +37,11 @@
then
DBDIR=$new_db_dir
fi
+ new_dir=`awk '/^[^#a-zA-Z]*common_dir/ { print $NF }' < $conffile`
+ if [ "$new_dir" != "" ]
+ then
+ COMMONDIR=$new_dir
+ fi
else
echo "Config file $conffile cannot be found"
exit 1