#include "gb-include.h" #include "Parms.h" #include "File.h" #include "Conf.h" //#include "CollectionRec.h" #include "TcpSocket.h" #include "HttpRequest.h" #include "Pages.h" // g_pages #include "Tagdb.h" // g_tagdb #include "Catdb.h" #include "Collectiondb.h" #include "HttpMime.h" // atotime() //#include "Msg28.h" //#include "Sync.h" #include "Indexdb.h" // for MIN_TRUNC #include "SearchInput.h" #include "Unicode.h" #include "Threads.h" #include "Spider.h" // MAX_SPIDER_PRIORITIES #include "Statsdb.h" #include "Sections.h" #include "Msg17.h" #include "Process.h" #include "Repair.h" #include "Ads.h" #include "LanguagePages.h" #include "PingServer.h" #include "Users.h" #include "Proxy.h" #include "hash.h" #include "Test.h" #include "Rebalance.h" #include "SpiderProxy.h" // buildProxyTable() #include "PageInject.h" // InjectionRequest // width of input box in characters for url filter expression #define REGEX_TXT_MAX 80 Parms g_parms; //#include "Tfndb.h" #include "Spider.h" #include "Tagdb.h" #include "Indexdb.h" #include "Datedb.h" //#include "Checksumdb.h" #include "Clusterdb.h" #include "Collectiondb.h" // // new functions to extricate info from parm recs // int32_t getDataSizeFromParmRec ( char *rec ) { return *(int32_t *)(rec+sizeof(key96_t)); } char *getDataFromParmRec ( char *rec ) { return rec+sizeof(key96_t)+4; } collnum_t getCollnumFromParmRec ( char *rec ) { key96_t *k = (key96_t *)rec; return (collnum_t)k->n1; } // for parms that are arrays... int16_t getOccNumFromParmRec ( char *rec ) { key96_t *k = (key96_t *)rec; return (int16_t)((k->n0>>16)); } Parm *getParmFromParmRec ( char *rec ) { key96_t *k = (key96_t *)rec; int32_t cgiHash32 = (k->n0 >> 32); return g_parms.getParmFast2 ( cgiHash32 ); } int32_t getHashFromParmRec ( char *rec ) { key96_t *k = (key96_t *)rec; int32_t cgiHash32 = (k->n0 >> 32); return cgiHash32; } // . occNum is index # for parms that are arrays. it is -1 if not used. // . collnum is -1 for g_conf, which is not a collrec // . occNUm is -1 for a non-array parm key96_t makeParmKey ( collnum_t collnum , Parm *m , int16_t occNum ) { key96_t k; k.n1 = collnum; k.n0 = (uint32_t)m->m_cgiHash; // 32 bit k.n0 <<= 16; k.n0 |= (uint16_t)occNum; // blanks k.n0 <<= 16; // delbit. 1 means positive key k.n0 |= 0x01; // test if ( getCollnumFromParmRec ((char *)&k)!=collnum){char*xx=NULL;*xx=0;} if ( getOccNumFromParmRec ((char *)&k)!=occNum){char*xx=NULL;*xx=0;} return k; } bool printUrlExpressionExamples ( SafeBuf *sb ) ; ////////////////////////////////////////////// // // Command Functions. All return false if block... yadayada // ////////////////////////////////////////////// //////// // // . do commands this way now // . when handleRequest4 receives a special "command" parmdb rec // it calls executes the cmd, one of the functions listed below // . all these Command*() functions are called in updateParm() below // . they return false if they would block and they'll call your callback // specified in you "we" the WaitEntry // . 
//////////////////////////////////////////////
//
// Command Functions. All return false if they block.
//
//////////////////////////////////////////////

////////
//
// . do commands this way now
// . when handleRequest4 receives a special "command" parmdb rec
//   it executes the cmd, one of the functions listed below
// . all these Command*() functions are called in updateParm() below
// . they return false if they would block and they'll call your callback
//   specified in your "we" WaitEntry
// . they return true with g_errno set on error, set to 0 on success
//
////////

// from PageBasic.cpp:
bool updateSiteListBuf(collnum_t collnum,bool addSeeds,char *siteListArg);

bool CommandUpdateSiteList ( char *rec ) {
	// caller must specify collnum
	collnum_t collnum = getCollnumFromParmRec ( rec );
	if ( collnum < 0 ) {
		log("parms: bad collnum for update site list");
		g_errno = ENOCOLLREC;
		return true;
	}
	// sanity
	int32_t dataSize = getDataSizeFromParmRec ( rec );
	if ( dataSize < 0 ) {
		log("parms: bad site list size = %"INT32" bad!",dataSize);
		g_errno = EBADENGINEER;
		return true;
	}
	// need this
	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	if ( ! cr ) {
		log("parms: no cr for collnum %"INT32" to update",
		    (int32_t)collnum);
		return true;
	}
	// get the sitelist
	char *data = getDataFromParmRec ( rec );
	// update the table that maps site to whether we should spider it
	// and also add newly introduced sites in "data" into spiderdb.
	updateSiteListBuf ( collnum ,
			    true ,  // add NEW seeds?
			    data ); // entire sitelist
	// now that we deduped the old site list with the new one for
	// purposes of adding NEW seeds, we can do the final copy
	cr->m_siteListBuf.set ( data );
	return true;
}

// . require the user to manually execute this so we do not corrupt the
//   data because of a bad hosts.conf file!!!
// . maybe put a red 'A' in the hosts table on the web page to indicate
//   we detected records that don't belong to our shard so user knows to
//   rebalance?
// . we'll show it in a special msg box on all admin pages if required
bool CommandRebalance ( char *rec ) {
	g_rebalance.m_userApproved = true;
	// force this to on so it goes through
	g_rebalance.m_numForeignRecs = 1;
	g_rebalance.m_needsRebalanceValid = false;
	return true;
}

bool CommandInsertUrlFiltersRow ( char *rec ) {
	// caller must specify collnum
	collnum_t collnum = getCollnumFromParmRec ( rec );
	if ( collnum < 0 ) {
		log("parms: bad collnum for insert row");
		g_errno = ENOCOLLREC;
		return true;
	}
	// sanity
	int32_t dataSize = getDataSizeFromParmRec ( rec );
	if ( dataSize <= 1 ) {
		log("parms: insert row data size = %"INT32" bad!",dataSize);
		g_errno = EBADENGINEER;
		return true;
	}
	// need this. insertParm() below dereferences it, so bail if the
	// collection rec is gone.
	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	if ( ! cr ) {
		log("parms: no cr for collnum %"INT32" to insert row",
		    (int32_t)collnum);
		g_errno = ENOCOLLREC;
		return true;
	}
	// get the row #
	char *data = getDataFromParmRec ( rec );
	int32_t rowNum = atol(data);//*(int32_t *)data;
	// scan all parms for url filter parms
	for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
		Parm *m = &g_parms.m_parms[i];
		// parm must be a url filters parm
		if ( m->m_page != PAGE_FILTERS ) continue;
		// must be an array!
		if ( ! m->isArray() ) continue;
		// sanity check
		if ( m->m_obj != OBJ_COLL ) { char *xx=NULL;*xx=0; }
		// . add that row
		// . returns false and sets g_errno on error
		if ( ! g_parms.insertParm ( i, rowNum,(char *)cr))
			return true;
	}
	return true;
}
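// A minimal sketch of a command function that follows the convention
// documented above. Illustrative only and not compiled in: "CommandNoop"
// is a made-up name, not part of the codebase.
/*
bool CommandNoop ( char *rec ) {
	// commands get the raw parm rec and pull out what they need
	collnum_t collnum = getCollnumFromParmRec ( rec );
	if ( collnum < 0 ) {
		// error: return true with g_errno set
		g_errno = ENOCOLLREC;
		return true;
	}
	// a command that would block returns false here and calls the
	// WaitEntry callback when it completes. this one never blocks.
	g_errno = 0;
	return true;
}
*/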
bool CommandRemoveConnectIpRow ( char *rec ) {
	// this is a global (g_conf) parm, so no collnum is needed
	//collnum_t collnum = getCollnumFromParmRec ( rec );
	//if ( collnum < 0 ) {
	//	g_errno = ENOCOLLREC;
	//	log("parms: bad collnum for remove row");
	//	return true;
	//}
	// sanity
	int32_t dataSize = getDataSizeFromParmRec ( rec );
	if ( dataSize <= 1 ) {
		log("parms: remove row data size = %"INT32" bad!",dataSize);
		g_errno = EBADENGINEER;
		return true;
	}
	// need this
	//CollectionRec *cr = g_collectiondb.getRec ( collnum );
	// get the row #
	char *data = getDataFromParmRec ( rec );
	int32_t rowNum = atol(data);
	// scan all parms for master ip parms
	for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
		Parm *m = &g_parms.m_parms[i];
		// parm must be a master passwords page parm
		if ( m->m_page != PAGE_MASTERPASSWORDS ) continue;
		// must be an array!
		if ( ! m->isArray() ) continue;
		// sanity check
		if ( m->m_obj != OBJ_CONF ) { char *xx=NULL;*xx=0; }
		// must be masterip
		if ( m->m_type != TYPE_IP ) continue;
		// . nuke that parm's element
		// . returns false and sets g_errno on error
		if (!g_parms.removeParm(i,rowNum,(char *)&g_conf))return true;
	}
	return true;
}

bool CommandRemovePasswordRow ( char *rec ) {
	// sanity
	int32_t dataSize = getDataSizeFromParmRec ( rec );
	if ( dataSize <= 1 ) {
		log("parms: remove row data size = %"INT32" bad!",dataSize);
		g_errno = EBADENGINEER;
		return true;
	}
	// get the row #
	char *data = getDataFromParmRec ( rec );
	int32_t rowNum = atol(data);
	// scan all parms for master password parms
	for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
		Parm *m = &g_parms.m_parms[i];
		// parm must be a master passwords page parm
		if ( m->m_page != PAGE_MASTERPASSWORDS ) continue;
		// must be an array!
		if ( ! m->isArray() ) continue;
		// sanity check
		if ( m->m_obj != OBJ_CONF ) { char *xx=NULL;*xx=0; }
		// must be master password
		if ( m->m_type != TYPE_STRINGNONEMPTY ) continue;
		// . nuke that parm's element
		// . returns false and sets g_errno on error
		if (!g_parms.removeParm(i,rowNum,(char *)&g_conf))return true;
	}
	return true;
}

bool CommandRemoveUrlFiltersRow ( char *rec ) {
	// caller must specify collnum
	collnum_t collnum = getCollnumFromParmRec ( rec );
	if ( collnum < 0 ) {
		g_errno = ENOCOLLREC;
		log("parms: bad collnum for remove row");
		return true;
	}
	// sanity
	int32_t dataSize = getDataSizeFromParmRec ( rec );
	if ( dataSize <= 1 ) {
		log("parms: remove row data size = %"INT32" bad!",dataSize);
		g_errno = EBADENGINEER;
		return true;
	}
	// need this
	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	// get the row #
	char *data = getDataFromParmRec ( rec );
	int32_t rowNum = atol(data);
	// scan all parms for url filter parms
	for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
		Parm *m = &g_parms.m_parms[i];
		// parm must be a url filters parm
		if ( m->m_page != PAGE_FILTERS ) continue;
		// must be an array!
		if ( ! m->isArray() ) continue;
		// sanity check
		if ( m->m_obj != OBJ_COLL ) { char *xx=NULL;*xx=0; }
		// . nuke that parm's element
		// . returns false and sets g_errno on error
		if ( ! g_parms.removeParm ( i,rowNum,(char *)cr))
			return true;
	}
	return true;
}

// after we add a new coll, or at any time after, we can clone it
bool CommandCloneColl ( char *rec ) {
	// the collnum we want to affect.
	collnum_t dstCollnum = getCollnumFromParmRec ( rec );
	// . data is the name of the collection to clone from, in ascii
	char *data = rec + sizeof(key96_t) + 4;
	int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t));
	//if ( dataSize < 1 ) { char *xx=NULL;*xx=0; }
	// copy parm settings from this collection name
	char *srcColl = data;
	// return if none to clone from
	if ( dataSize <= 0 ) return true;
	// avoid defaulting to main collection
	if ( ! data[0] ) return true;
	CollectionRec *srcRec = NULL;
	CollectionRec *dstRec = NULL;
	srcRec = g_collectiondb.getRec ( srcColl );    // get from name
	dstRec = g_collectiondb.getRec ( dstCollnum ); // get from #
	// log it. return true, not false, since returning false would
	// signal to the caller that we blocked.
	if ( ! srcRec ) {
		log("parms: invalid coll %s to clone from",srcColl);
		return true;
	}
	if ( ! dstRec ) {
		log("parms: invalid collnum %"INT32" to clone to",
		    (int32_t)dstCollnum);
		return true;
	}
	log ("parms: cloning parms from collection %s to %s",
	     srcRec->m_coll,dstRec->m_coll);
	g_parms.cloneCollRec ( (char *)dstRec , (char *)srcRec );
	return true;
}
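// Note on payload encoding: the commands above all receive their argument
// as an ascii string in the rec's data area. Row numbers ("3"), collnums
// ("467") and collection names ("main") are parsed with atol() or used
// directly as strings. dataSize counts the terminating \0, so a
// one-character payload has dataSize 2, which is why the sanity checks
// above reject "dataSize <= 1". A sketch (illustrative only):
/*
	char *data       = getDataFromParmRec ( rec );     // "3\0"
	int32_t dataSize = getDataSizeFromParmRec ( rec ); // 2
	int32_t rowNum   = atol ( data );                  // 3
*/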
from "&restart=467" for example char *data = rec + sizeof(key96_t) + 4; int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t)); //if ( dataSize < 1 ) { char *xx=NULL;*xx=0; } // copy parm settings from this collection name char *srcColl = data; // return if none to clone from if ( dataSize <= 0 ) return true; // avoid defaulting to main collection if ( ! data[0] ) return true; CollectionRec *srcRec = NULL; CollectionRec *dstRec = NULL; srcRec = g_collectiondb.getRec ( srcColl ); // get from name dstRec = g_collectiondb.getRec ( dstCollnum ); // get from # if ( ! srcRec ) return log("parms: invalid coll %s to clone from", srcColl); if ( ! dstRec ) return log("parms: invalid collnum %"INT32" to clone to", (int32_t)dstCollnum); log ("parms: cloning parms from collection %s to %s", srcRec->m_coll,dstRec->m_coll); g_parms.cloneCollRec ( (char *)dstRec , (char *)srcRec ); return true; } // customCrawl: // 0 for regular collection // 1 for custom crawl // 2 for bulk job // . returns false if blocks true otherwise bool CommandAddColl ( char *rec , char customCrawl ) { // caller must specify collnum collnum_t newCollnum = getCollnumFromParmRec ( rec ); // sanity. if ( newCollnum < 0 ) { g_errno = ENOCOLLREC; log("parms: bad collnum for AddColl"); return true; } char *data = rec + sizeof(key96_t) + 4; int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t)); // collection name must be at least 2 bytes (includes \0) if ( dataSize <= 1 ) { char *xx=NULL;*xx=0; } // then collname, \0 terminated char *collName = data; if ( gbstrlen(collName) > MAX_COLL_LEN ) { log("crawlbot: collection name too long"); return true; } // if ( ! g_parms.m_inSyncWithHost0 ) { // log("parms: can not add coll #%i %s until in sync with host 0", // (int)newCollnum,collName); // g_errno = EBADENGINEER; // return true; // } // this saves it to disk! returns false and sets g_errno on error. if ( ! g_collectiondb.addNewColl ( collName, customCrawl , NULL , // copy from 0 , // copy from len true , // save? newCollnum ) ) // error! g_errno should be set return true; return true; } // all nodes are guaranteed to add the same collnum for the given name bool CommandAddColl0 ( char *rec ) { // regular collection return CommandAddColl ( rec , 0 ); } bool CommandAddColl1 ( char *rec ) { // custom crawl return CommandAddColl ( rec , 1 ); } bool CommandAddColl2 ( char *rec ) { // bulk job return CommandAddColl ( rec , 2 ); } bool CommandResetProxyTable ( char *rec ) { // from SpiderProxy.h return resetProxyStats(); } // . returns true and sets g_errno on error // . returns false if would block bool CommandDeleteColl ( char *rec , WaitEntry *we ) { collnum_t collnum = getCollnumFromParmRec ( rec ); // if ( ! g_parms.m_inSyncWithHost0 ) { // log("parms: can not del collnum %i until in sync with host 0", // (int)collnum); // g_errno = EBADENGINEER; // return true; // } // the delete might block because the tree is saving and we can't // remove our collnum recs from it while it is doing that if ( ! g_collectiondb.deleteRec2 ( collnum ) ) // we blocked, we->m_callback will be called when done return false; // delete is successful return true; } // . returns true and sets g_errno on error // . returns false if would block bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) { char *data = rec + sizeof(key96_t) + 4; char *coll = (char *)data; collnum_t collnum = g_collectiondb.getCollnum ( coll ); // if ( ! 
bool CommandForceNextSpiderRound ( char *rec ) {
	// caller must specify collnum
	collnum_t collnum = getCollnumFromParmRec ( rec );
	// need this
	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("parms: bad collnum %"INT32" for restart spider round",
		    (int32_t)collnum);
		return true;
	}
	// the parmlist is an rdblist: a key followed by 4 bytes of
	// datasize, then the data... which is an ascii string in our case
	char *data = getDataFromParmRec ( rec );
	uint32_t roundStartTime;
	int32_t newRoundNum;
	// see the HACK: in Parms::convertHttpRequestToParmList() where we
	// construct this data in response to a "roundStart" cmd. we used
	// sprintf() so it's natural to use sscanf() to parse it out.
	// the data looks like "<roundStartTime>,<roundNum>".
	sscanf ( data , "%"UINT32",%"INT32"",
		 &roundStartTime,
		 &newRoundNum);
	cr->m_spiderRoundStartTime = roundStartTime;
	cr->m_spiderRoundNum = newRoundNum;
	// if we don't set this, it prints out "skipping0 ... " for urls
	// we try to spider in Spider.cpp.
	cr->m_spiderStatus = SP_INPROGRESS;
	// reset the round counts. this will log a msg. resetting the
	// round counts will prevent maxToProcess/maxToCrawl from holding
	// us back...
	spiderRoundIncremented ( cr );
	// yeah, if we don't nuke doledb then it doesn't work...
	cr->rebuildUrlFilters();
	return true;
}

// . returns true and sets g_errno on error
// . returns false if would block
bool CommandRestartColl ( char *rec , WaitEntry *we ) {
	collnum_t newCollnum = getCollnumFromParmRec ( rec );
	// . data is the collnum in ascii.
	// . from "&restart=467" for example
	char *data = rec + sizeof(key96_t) + 4;
	int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t));
	if ( dataSize < 1 ) { char *xx=NULL;*xx=0; }
	collnum_t oldCollnum = atol(data);
	if ( oldCollnum < 0 ||
	     oldCollnum >= g_collectiondb.m_numRecs ||
	     ! g_collectiondb.m_recs[oldCollnum] ) {
		log("parms: invalid collnum %"INT32" to restart",
		    (int32_t)oldCollnum);
		return true;
	}
	// this can block if tree is saving, it has to wait
	// for tree save to complete before removing old
	// collnum recs from tree
	if ( ! g_collectiondb.resetColl2 ( oldCollnum ,
					   newCollnum ,
					   false ) ) // purgeSeeds?
		// we blocked, we->m_callback will be called when done
		return false;
	// turn on spiders on new collrec. collname is same but collnum
	// will be different.
	CollectionRec *cr = g_collectiondb.getRec ( newCollnum );
	// if reset from crawlbot api page then enable spiders
	// to avoid user confusion
	//if ( cr ) cr->m_spideringEnabled = 1;
	if ( ! cr ) return true;
	//
	// repopulate spiderdb with the same sites
	//
	char *oldSiteList = cr->m_siteListBuf.getBufStart();
	// do not let it have the buf any more
	cr->m_siteListBuf.detachBuf();
	// can't leave it NULL, safebuf parms do not like to be null
	cr->m_siteListBuf.nullTerm();
	// re-add the buf so it re-seeds spiderdb. it will not dedup these
	// urls in "oldSiteList" with "m_siteListBuf" which is now empty.
	// "true" = addSeeds.
updateSiteListBuf ( newCollnum , true , oldSiteList ); // now put it back if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList ); // all done return true; } // . returns true and sets g_errno on error // . returns false if would block bool CommandResetColl ( char *rec , WaitEntry *we ) { collnum_t newCollnum = getCollnumFromParmRec ( rec ); // . data is the collnum in ascii. // . from "&restart=467" for example char *data = rec + sizeof(key96_t) + 4; int32_t dataSize = *(int32_t *)(rec + sizeof(key96_t)); if ( dataSize < 1 ) { char *xx=NULL;*xx=0; } collnum_t oldCollnum = atol(data); if ( oldCollnum < 0 || oldCollnum >= g_collectiondb.m_numRecs || ! g_collectiondb.m_recs[oldCollnum] ) { log("parms: invalid collnum %"INT32" to reset",(int32_t)oldCollnum); return true; } // this will not go through if tree is saving, it has to wait // for tree save to complete before removing old // collnum recs from tree. so return false in that case so caller // will know to re-call later. if ( ! g_collectiondb.resetColl2 ( oldCollnum , newCollnum , true ) ) // purgeSeeds? // we blocked, we->m_callback will be called when done return false; // turn on spiders on new collrec. collname is same but collnum // will be different. CollectionRec *cr = g_collectiondb.getRec ( newCollnum ); if ( ! cr ) return true; // // repopulate spiderdb with the same sites // char *oldSiteList = cr->m_siteListBuf.getBufStart(); // do not let it have the buf any more cr->m_siteListBuf.detachBuf(); // can't leave it NULL, safebuf parms do not like to be null cr->m_siteListBuf.nullTerm(); // re-add the buf so it re-seeds spiderdb. it will not dedup these // urls in "oldSiteList" with "m_siteListBuf" which is now empty. // "true" = addSeeds. updateSiteListBuf ( newCollnum , true , oldSiteList ); // now put it back if ( oldSiteList ) cr->m_siteListBuf.safeStrcpy ( oldSiteList ); // turn spiders off //if ( cr ) cr->m_spideringEnabled = 0; return true; } bool CommandParserTestInit ( char *rec ) { // enable testing for all other hosts g_conf.m_testParserEnabled = 1; // reset all files g_test.removeFiles(); // turn spiders on globally g_conf.m_spideringEnabled = 1; //g_conf.m_webSpideringEnabled = 1; // turn on for test coll too CollectionRec *cr = g_collectiondb.getRec("qatest123"); // turn on spiders if ( cr ) cr->m_spideringEnabled = 1; // tell spider loop to update active list g_spiderLoop.m_activeListValid = false; // if we are not host 0, turn on spiders for testing if ( g_hostdb.m_myHost->m_hostId != 0 ) return true; // start the test loop to inject urls for parsing/spidering g_test.initTestRun(); // done return true; } bool CommandSpiderTestInit ( char *rec ) { // enable testing for all other hosts g_conf.m_testSpiderEnabled = 1; // reset all files g_test.removeFiles(); // turn spiders on globally g_conf.m_spideringEnabled = 1; //g_conf.m_webSpideringEnabled = 1; // turn on for test coll too CollectionRec *cr = g_collectiondb.getRec("qatest123"); // turn on spiders if ( cr ) cr->m_spideringEnabled = 1; // tell spider loop to update active list g_spiderLoop.m_activeListValid = false; // if we are not host 0, turn on spiders for testing if ( g_hostdb.m_myHost->m_hostId != 0 ) return true; // start the test loop to inject urls for parsing/spidering g_test.initTestRun(); // done return true; } bool CommandSpiderTestCont ( char *rec ) { // enable testing for all other hosts g_conf.m_testSpiderEnabled = 1; // turn spiders on globally g_conf.m_spideringEnabled = 1; //g_conf.m_webSpideringEnabled = 1; // turn on for test coll 
too CollectionRec *cr = g_collectiondb.getRec("qatest123"); // turn on spiders if ( cr ) cr->m_spideringEnabled = 1; // tell spider loop to update active list g_spiderLoop.m_activeListValid = false; // done return true; } // some of these can block a little. if threads are off, a lot! bool CommandMerge ( char *rec ) { forceMergeAll ( RDB_POSDB ,1); forceMergeAll ( RDB_TITLEDB ,1); forceMergeAll ( RDB_TAGDB ,1); forceMergeAll ( RDB_SPIDERDB ,1); forceMergeAll ( RDB_LINKDB ,1); // most of these are probably already in good shape //g_checksumdb.getRdb()->attemptMerge (1,true); // g_clusterdb.getRdb()->attemptMerge (1,true); // niceness, force? // g_tagdb.getRdb()->attemptMerge (1,true); // g_catdb.getRdb()->attemptMerge (1,true); // //g_tfndb.getRdb()->attemptMerge (1,true); // g_spiderdb.getRdb()->attemptMerge (1,true); // // these 2 will probably need the merge the most // g_indexdb.getRdb()->attemptMerge (1,true); // g_datedb.getRdb()->attemptMerge (1,true); // g_titledb.getRdb()->attemptMerge (1,true); // //g_sectiondb.getRdb()->attemptMerge (1,true); // g_statsdb.getRdb()->attemptMerge (1,true); // g_linkdb .getRdb()->attemptMerge (1,true); return true; } bool CommandMergePosdb ( char *rec ) { forceMergeAll ( RDB_POSDB ,1); // set this for each posdb base return true; } bool CommandMergeSectiondb ( char *rec ) { //g_sectiondb.getRdb()->attemptMerge (1,true); // nice , force return true; } bool CommandMergeTitledb ( char *rec ) { forceMergeAll ( RDB_TITLEDB ,1); //g_titledb.getRdb()->attemptMerge (1,true); return true; } bool CommandMergeSpiderdb ( char *rec ) { forceMergeAll ( RDB_SPIDERDB ,1); //g_spiderdb.getRdb()->attemptMerge (1,true); return true; } bool CommandDiskPageCacheOff ( char *rec ) { g_process.resetPageCaches(); return true; } bool CommandForceIt ( char *rec ) { g_conf.m_forceIt = true; return true; } bool CommandDiskDump ( char *rec ) { //g_checksumdb.getRdb()->dumpTree ( 1 ); // niceness g_clusterdb.getRdb()->dumpTree ( 1 ); g_tagdb.getRdb()->dumpTree ( 1 ); g_catdb.getRdb()->dumpTree ( 1 ); //g_tfndb.getRdb()->dumpTree ( 1 ); g_spiderdb.getRdb()->dumpTree ( 1 ); g_posdb.getRdb()->dumpTree ( 1 ); //g_datedb.getRdb()->dumpTree ( 1 ); g_titledb.getRdb()->dumpTree ( 1 ); //g_sectiondb.getRdb()->dumpTree ( 1 ); g_statsdb.getRdb()->dumpTree ( 1 ); g_linkdb.getRdb() ->dumpTree ( 1 ); g_errno = 0; return true; } bool CommandJustSave ( char *rec ) { // returns false if blocked, true otherwise g_process.save (); // always return true here return true; } bool CommandSaveAndExit ( char *rec ) { // return true if this blocks g_process.shutdown ( false , NULL , NULL ); return true; } bool CommandUrgentSaveAndExit ( char *rec ) { // "true" means urgent g_process.shutdown ( true ); return true; } bool CommandReloadLanguagePages ( char *rec ) { g_languagePages.reloadPages(); return true; } bool CommandClearKernelError ( char *rec ) { g_hostdb.m_myHost->m_pingInfo.m_kernelErrors = 0; return true; } bool CommandPowerNotice ( int32_t hasPower ) { //int32_t hasPower = r->getLong("haspower",-1); log("powermo: received haspower=%"INT32"",hasPower); if ( hasPower != 0 && hasPower != 1 ) return true; // did power state change? if not just return true if ( g_process.m_powerIsOn && hasPower ) return true; if ( ! g_process.m_powerIsOn && ! hasPower ) return true; if ( hasPower ) { log("powermo: power is regained"); g_process.m_powerIsOn = true; return true; } // if it was on and went off... // now it is off log("powermo: power was lost"); // . 
SpiderLoop.cpp will not launch any more spiders as
	//   long as the power is off
	// . autosave should kick in every 30 seconds
	g_process.m_powerIsOn = false;
	// note the autosave
	log("powermo: disabling spiders, suspending merges, disabling "
	    "tree writes and saving.");
	// tell Process.cpp::save2() to save the blocking caches too!
	//g_process.m_pleaseSaveCaches = true;
	// . save everything now... this may block some when saving the
	//   caches... then do not do ANY writes...
	// . RdbMerge suspends all merging if power is off
	// . Rdb.cpp does not allow any adds if power is off. it will
	//   send back an ETRYAGAIN...
	// . if a tree is being dumped, this will keep re-calling
	//   Process.cpp::save2()
	g_process.save();
	// also send an email if we are host #0
	if ( g_hostdb.m_myHost->m_hostId != 0 ) return true;
	if ( g_proxy.isProxy() ) return true;
	char tmp[128];
	Host *h0 = g_hostdb.getHost ( 0 );
	int32_t ip0 = 0;
	if ( h0 ) ip0 = h0->m_ip;
	sprintf(tmp,"%s: POWER IS OFF",iptoa(ip0));
	g_pingServer.sendEmail ( NULL  , // Host ptr
				 tmp   , // msg
				 true  , // sendToAdmin
				 false , // oom?
				 false , // kernel error?
				 true  , // parm change?
				 // force it? even if disabled?
				 false );
	return true;
}

bool CommandPowerOnNotice ( char *rec ) {
	return CommandPowerNotice ( 1 );
}

bool CommandPowerOffNotice ( char *rec ) {
	return CommandPowerNotice ( 0 );
}

bool CommandInSync ( char *rec ) {
	g_parms.m_inSyncWithHost0 = true;
	return true;
}

//////////////////////
//
// end new commands
//
//////////////////////

static bool printDropDown ( int32_t n , SafeBuf* sb, char *name,
			    int32_t select ,
			    bool includeMinusOne ,
			    bool includeMinusTwo ) ;

extern bool closeAll ( void *state, void (* callback)(void *state) );
extern bool allExit ( ) ;

/*
class Checksum {
public:
	Checksum() : m_sum1( 0xffff ), m_sum2( 0xffff ) {}

	void addIn( const uint16_t *data, size_t size, FILE *f = 0 ) {
		// if an odd len of data, add first byte, then do rest below
		if ( size % 2 != 0 ) {
			m_sum1 += (uint16_t)*(uint8_t *)data;
			m_sum2 += m_sum1;
			size--;
			data = (uint16_t *)(((uint8_t *)data)+1);
		}
		size_t len = size/2;
		while ( len ) {
			unsigned tlen = len;
			// . 360 is the largest amount of sums that can be
			//   performed without overflow
			if ( len > 360 ) tlen = 360;
			len -= tlen;
			do {
				m_sum1 += *data++;
				m_sum2 += m_sum1;
			} while ( --tlen );
			m_sum1 = (m_sum1 & 0xffff) + (m_sum1 >> 16);
			m_sum2 = (m_sum2 & 0xffff) + (m_sum2 >> 16);
		}
	}

	void addInStrings( const uint16_t *data, int32_t cnt, int32_t size ){
		while ( cnt ) {
			const uint16_t *origData = data;
			int32_t len = gbstrlen((char *)data);
			// if an odd len of data, add first byte,
			// then do rest below
			if ( len % 2 != 0 ) {
				m_sum1 += (uint16_t)*(uint8_t *)data;
				m_sum2 += m_sum1;
				len--;
				data = (uint16_t *)(((uint8_t *)data)+1);
			}
			len /= 2;
			while ( len ) {
				unsigned tlen = len;
				// . 
360 = largest amnt of sums that can be // performed without overflow if ( len > 360 ) tlen = 360; len -= tlen; do { m_sum1 += *data++; m_sum2 += m_sum1; } while ( --tlen ); m_sum1 = (m_sum1 & 0xffff) + (m_sum1 >> 16); m_sum2 = (m_sum2 & 0xffff) + (m_sum2 >> 16); } cnt--; data = (uint16_t *)((char *)origData + size); } } void finalize() { m_sum1 = (m_sum1 & 0xffff) + (m_sum1 >> 16); m_sum2 = (m_sum2 & 0xffff) + (m_sum2 >> 16); } uint32_t getSum() const { return ( m_sum2 << 16 | m_sum1 ); } private: uint32_t m_sum1; uint32_t m_sum2; }; */ Parms::Parms ( ) { m_isDefaultLoaded = false; m_inSyncWithHost0 = false; m_triedToSync = false; } void Parms::detachSafeBufs ( CollectionRec *cr ) { for ( int32_t i = 0 ; i < m_numParms ; i++ ) { Parm *m = &m_parms[i]; if ( m->m_type != TYPE_SAFEBUF ) continue; if ( m->m_obj != OBJ_COLL ) continue; if ( m->m_off < 0 ) continue; int32_t max = 1; // this will be zero if not an array. // otherwise it is the # of elements in the array if ( m->m_size > max ) max = m->m_size; // an array of safebufs? m->m_size will be > 1 then. for ( int32_t j = 0 ; j < max ; j++ ) { // get it SafeBuf *sb = (SafeBuf *)((char *)cr + m->m_off + j*sizeof(SafeBuf)); sb->detachBuf(); } } } /* uint32_t Parms::calcChecksum() { Checksum cs; for ( int32_t i = 0 ; i < m_numParms ; i++ ) { Parm *m = &m_parms[i]; if ( m->m_obj == OBJ_SI ) continue; if ( m->m_off < 0 ) continue; if ( m->m_type == TYPE_COMMENT ) continue; if ( m->m_type == TYPE_MONOD2 ) continue; if ( m->m_type == TYPE_MONOM2 ) continue; if ( m->m_type == TYPE_CMD ) continue; if ( m->m_type == TYPE_LONG_CONST ) continue; int32_t size = 0; if ( m->m_type == TYPE_CHECKBOX ) size = 1; if ( m->m_type == TYPE_CHAR ) size = 1; if ( m->m_type == TYPE_CHAR2 ) size = 1; if ( m->m_type == TYPE_BOOL ) size = 1; if ( m->m_type == TYPE_BOOL2 ) size = 1; if ( m->m_type == TYPE_PRIORITY ) size = 1; if ( m->m_type == TYPE_PRIORITY2 ) size = 1; //if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1; if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1; if ( m->m_type == TYPE_RETRIES ) size = 1; if ( m->m_type == TYPE_TIME ) size = 6; if ( m->m_type == TYPE_DATE2 ) size = 4; if ( m->m_type == TYPE_DATE ) size = 4; if ( m->m_type == TYPE_FLOAT ) size = 4; if ( m->m_type == TYPE_IP ) size = 4; if ( m->m_type == TYPE_RULESET ) size = 4; if ( m->m_type == TYPE_LONG ) size = 4; if ( m->m_type == TYPE_LONG_LONG ) size = 8; if ( m->m_type == TYPE_STRING ) size = m->m_size; if ( m->m_type == TYPE_STRINGBOX ) size = m->m_size; if ( m->m_type == TYPE_STRINGNONEMPTY ) size = m->m_size; if ( m->m_type == TYPE_SAFEBUF ) size = m->m_size; if ( m->m_type == TYPE_SITERULE ) size = 4; // if we have an array int32_t cnt = 1; if (m->m_fixed > 0) { size *= m->m_fixed; cnt = m->m_fixed; } else { size *= m->m_max; cnt = m->m_max; } uint16_t *p = NULL; if ( m->m_obj == OBJ_CONF ) { p = (uint16_t *)((char *)&g_conf + m->m_off); if (m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_STRINGNONEMPTY ) { cs.addInStrings( p, cnt, m->m_size ); } else if ( m->m_type == TYPE_SAFEBUF ) { uint16_t *p2; SafeBuf *sb2 = (SafeBuf *)p; p2 = (uint16_t *)sb2->getBufStart(); cs.addIn( p2 , sb2->length() ); } else { cs.addIn( p, size ); } } else if ( m->m_obj == OBJ_COLL ) { collnum_t j = g_collectiondb.getFirstCollnum (); while ( j >= 0 ) { CollectionRec *cr = g_collectiondb.getRec( j ); p = (uint16_t *)((char *)cr + m->m_off); if (m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_STRINGNONEMPTY ) { cs.addInStrings( p, cnt, m->m_size ); } 
else if ( m->m_type == TYPE_SAFEBUF ) { uint16_t *p2; SafeBuf *sb2 = (SafeBuf *)p; p2 = (uint16_t *)sb2->getBufStart(); cs.addIn( p2 , sb2->length() ); } else { cs.addIn( p, size ); } j = g_collectiondb.getNextCollnum ( j ); } } } cs.finalize(); return cs.getSum(); } */ // from Pages.cpp bool printApiForPage ( SafeBuf *sb , int32_t PAGENUM , CollectionRec *cr ) ; // returns false and sets g_errno on error bool Parms::setGigablastRequest ( TcpSocket *socket , HttpRequest *hrArg , GigablastRequest *gr ) { // get the page from the path... like /sockets --> PAGE_SOCKETS int32_t page = g_pages.getDynamicPageNumber ( hrArg ); // is it a collection? char *THIS = (char *)gr; // ensure valid if ( ! THIS ) { // it is null when no collection explicitly specified... log("admin: THIS is null for page %"INT32".",page); return false; } // just in case memset ( gr , 0 , sizeof(GigablastRequest) ); gr->m_socket = socket; // make a copy of the httprequest because the original is on the stack // in HttpServer::requestHandler() if ( ! gr->m_hr.copy ( hrArg ) ) { log("admin: failed to copy httprequest: %s", mstrerror(g_errno)); return false; } // use the one we copied which won't disappear/beFreed on us HttpRequest *hr = &gr->m_hr; // need this int32_t obj = OBJ_GBREQUEST; // // reset THIS to defaults. use NULL for cr since mostly for SearchInput // setToDefault ( THIS , obj , NULL); // map PAGE_ADDURL to PAGE_ADDURL2 so // /addurl is same as /admin/addurl as far as parms. if ( page == PAGE_ADDURL ) page = PAGE_ADDURL2; // loop through cgi parms for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) { // get cgi parm name char *field = hr->getField ( i ); //int32_t flen = hr->getFieldLen ( i ); // find in parms list int32_t j; Parm *m; for ( j = 0 ; j < m_numParms ; j++ ) { // get it m = &m_parms[j]; // must be of this type if ( m->m_obj != obj ) continue; // page must match if ( m->m_page != page ) continue; // skip if no cgi parm, may not be configurable now if ( ! m->m_cgi ) continue; // otherwise, must match the cgi name exactly if ( strcmp ( field,m->m_cgi ) == 0 ) break; //if ( ! m->m_cgi2 ) continue; // alias check //if ( strcmp ( field,m->m_cgi2 ) == 0 ) break; //if ( ! m->m_cgi2 ) continue; // alias check //if ( strcmp ( field,m->m_cgi3 ) == 0 ) break; //if ( ! m->m_cgi3 ) continue; // alias check //if ( strcmp ( field,m->m_cgi4 ) == 0 ) break; } // bail if the cgi field is not in the parms list if ( j >= m_numParms ) { //log("parms: missing cgi parm %s",field); continue; } // value of cgi parm (null terminated) char *v = hr->getValue ( i ); // . skip if no value was provided // . unless it was a string! so we can make them empty. if ( v[0] == '\0' && m->m_type != TYPE_CHARPTR && m->m_type != TYPE_STRING && m->m_type != TYPE_STRINGBOX ) continue; // skip if offset is negative, that means none if ( m->m_off < 0 ) continue; // skip if no permission //if ( (m->m_perms & user) == 0 ) continue; // set it. now our TYPE_CHARPTR will just be set to it directly // to save memory... setParm ( (char *)THIS , m, j, 0, v, false,//not html enc false ); // true ); // need to save it //if ( THIS != (char *)&g_conf ) // ((CollectionRec *)THIS)->m_needsSave = true; } return true; } bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ); // . returns false if blocked, true otherwise // . sets g_errno on error // . must ultimately send reply back on "s" // . 
called by Pages.cpp's sendDynamicReply() when it calls pg->function()
//   which is called by HttpServer::sendReply(s,r) when it gets an
//   http request
bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r ) {

	char buf [ 128000 ];
	SafeBuf stackBuf(buf,128000);
	SafeBuf *sb = &stackBuf;

	int32_t page = g_pages.getDynamicPageNumber ( r );
	char format = r->getReplyFormat();
	char guide = r->getLong("guide",0);

	bool isMasterAdmin = g_conf.isMasterAdmin ( s , r );
	bool isCollAdmin   = g_conf.isCollAdmin   ( s , r );
	if ( ! g_conf.m_allowCloudUsers &&
	     ! isMasterAdmin &&
	     ! isCollAdmin ) {
		char *msg = "NO PERMISSION";
		return g_httpServer.sendDynamicPage (s, msg,gbstrlen(msg));
	}

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	char *action = r->getString("action",NULL);
	if ( page == PAGE_BASIC_SETTINGS && guide &&
	     // this is non-null if handling a submit request
	     action &&
	     format == FORMAT_HTML ) {
		//return g_parms.sendPageGeneric ( s, r,
		//				   PAGE_BASIC_SETTINGS );
		// just redirect to it
		char *coll = r->getString("c",NULL);
		if ( coll ) {
			sb->safePrintf("", coll);
			return g_httpServer.sendDynamicPage (s,
						sb->getBufStart(),
						sb->length());
		}
	}

	//
	// some "generic" pages do additional processing on the provided
	// input so we need to call those functions here...
	//
	// if we were an addurl page..
	//if ( page == PAGE_ADDURL2 ) {
	//	// this returns false if blocked and it should re-call
	//	// sendPageGeneric when completed
	//	if ( ! processAddUrlRequest ( s , r ) )
	//		return false;
	//}

	char *bodyjs = NULL;
	if ( page == PAGE_BASIC_SETTINGS )
		bodyjs =" onload=document.getElementById('tabox').focus();";

	// print standard header
	if ( format != FORMAT_XML && format != FORMAT_JSON )
		g_pages.printAdminTop ( sb , s , r , NULL , bodyjs );

	// xml/json header
	char *res = NULL;
	if ( format == FORMAT_XML )
		res = "<response>\n"
		      "\t<statusCode>0</statusCode>\n"
		      "\t<statusMsg>Success</statusMsg>\n";
	if ( format == FORMAT_JSON )
		res = "{ \"response\":{\n"
		      "\t\"statusCode\":0,\n"
		      "\t\"statusMsg\":\"Success\"\n";
	if ( res ) sb->safeStrcpy ( res );

	// do not show the parms and their current values unless &show=1
	// was explicitly given for the xml/json feeds
	int32_t show = 1;
	if ( format != FORMAT_HTML ) show = r->getLong("show",0);
	if ( show ) printParmTable ( sb , s , r );

	// xml/json tail
	if ( format == FORMAT_XML  ) res = "</response>\n";
	if ( format == FORMAT_JSON ) res = "\t}\n}\n";
	if ( res ) sb->safeStrcpy ( res );

	bool POSTReply = g_pages.getPage ( page )->m_usePost;

	char *ct = "text/html";
	if ( format == FORMAT_XML  ) ct = "text/xml";
	if ( format == FORMAT_JSON ) ct = "application/json";

	return g_httpServer.sendDynamicPage ( s ,
					      sb->getBufStart() ,
					      sb->length() ,
					      -1 ,
					      POSTReply ,
					      ct ,  // contType
					      -1 ,  // httpstatus
					      NULL, //cookie ,
					      NULL );// charset
}

bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {

	int32_t page = g_pages.getDynamicPageNumber ( r );
	int32_t fromIp = s->m_ip;
	char format = r->getReplyFormat();

	/*
	if ( format == FORMAT_HTML )
		sb->safePrintf ( "");
	*/

	if ( page == PAGE_COLLPASSWORDS2 )
		page = PAGE_COLLPASSWORDS;

	// print the start of the table
	char *tt = "None";
	if ( page == PAGE_LOG             ) tt = "Log Controls";
	if ( page == PAGE_MASTER          ) tt = "Master Controls";
	if ( page == PAGE_INJECT          ) tt = "Inject Url";
	if ( page == PAGE_MASTERPASSWORDS ) tt = "Master Passwords";
	if ( page == PAGE_ADDURL2         ) tt = "Add Urls";
	if ( page == PAGE_SPIDER          ) tt = "Spider Controls";
	if ( page == PAGE_SEARCH          ) tt = "Search Controls";
	if ( page == PAGE_ACCESS          ) tt = "Access Controls";
	if ( page == PAGE_FILTERS         ) tt = "Url Filters";
	if ( page == PAGE_BASIC_SETTINGS  ) tt = "Settings";
	if (
page == PAGE_COLLPASSWORDS ) tt = "Collection Passwords"; //if ( page == PAGE_SITES ) tt = "Site List"; //if ( page == PAGE_PRIORITIES ) tt = "Priority Controls"; //if ( page == PAGE_RULES ) tt = "Site Rules"; //if ( page == PAGE_SYNC ) tt = "Sync"; if ( page == PAGE_REPAIR ) tt = "Rebuild Controls"; //if ( page == PAGE_ADFEED ) tt = "Ad Feed Controls"; // special messages for spider controls char *e1 = ""; char *e2 = ""; if ( page == PAGE_SPIDER && ! g_conf.m_spideringEnabled ) e1 = "
" "Spidering is temporarily disabled in Master Controls." "\n"; if ( page == PAGE_SPIDER && ! g_conf.m_addUrlEnabled ) e2 = "
" "Add url is temporarily disabled in Master Controls." "\n"; if ( format == FORMAT_XML || format == FORMAT_JSON ) { char *coll = g_collectiondb.getDefaultColl(r); CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true); bool isMasterAdmin = g_conf.isMasterAdmin ( s , r ); bool isCollAdmin = g_conf.isCollAdmin ( s , r ); g_parms.printParms2 ( sb , page , cr , 1 , // int32_t nc , # cols? 1 , // int32_t pd , print desc? false , // isCrawlbot format , NULL , // TcpSocket *sock isMasterAdmin , isCollAdmin ); return true; } // . page repair (PageRepair.cpp) has a status table BEFORE the parms // iff we are doing a repair // . only one page for all collections, we have a parm that is // a comma-separated list of the collections to repair. leave blank // to repair all collections. if ( page == PAGE_REPAIR ) g_repair.printRepairStatus ( sb , fromIp ); // start the table sb->safePrintf( "\n" "" "%s%s\n", tt,e1,e2); //bool isCrawlbot = false; //if ( collOveride ) isCrawlbot = true; // print the table(s) of controls //p= g_parms.printParms (p, pend, page, user, THIS, coll, pwd, nc, pd); g_parms.printParms ( sb , s , r ); // end the table sb->safePrintf ( "
"// bgcolor=#%s>" ,TABLE_STYLE //,DARKER_BLUE //,DARK_BLUE ); /* take this out since we took out a ton of parms for simplicties sake if ( page != PAGE_FILTERS ) sb->safePrintf("
" "filter:
" ); */ sb->safePrintf(//"
" //"" "
" "%s" //"" "
" //"
" "
\n" ); // this must be outside of table, submit button follows sb->safePrintf ( "
\n" ); if ( page == PAGE_SPIDERPROXIES ) { // wrap up the form, print a submit button g_pages.printSubmit ( sb ); printSpiderProxyTable ( sb ); // do not print another submit button return true; } // url filter page has a test table if ( page == PAGE_FILTERS ) { // wrap up the form, print a submit button g_pages.printSubmit ( sb ); printUrlExpressionExamples ( sb ); } else if ( page == PAGE_BASIC_SETTINGS ) { // wrap up the form, print a submit button g_pages.printSubmit ( sb ); printSitePatternExamples ( sb , r ); } else if ( page == PAGE_SPIDER ) { // PAGE_SITES // wrap up the form, print a submit button g_pages.printSubmit ( sb ); printSitePatternExamples ( sb , r ); } else { // wrap up the form, print a submit button g_pages.printAdminBottom ( sb ); } // extra sync table /* if ( page == PAGE_SYNC ) { // a table that shows the progress of a sync process sb.safePrintf ( "
" "" //"\n" , DARK_BLUE); for ( int32_t i = RDB_START ; i < RDB_END ; i++ ) { Rdb *r = getRdbFromId ( i ); if ( ! r ) continue; float pd = g_sync.getPercentDone ( i ); sb.safePrintf ( "" "" "\n", r->m_dbname , pd ); } sb.safePrintf ( "
" "
" "
" //"" "Sync Progress" //"" "
%s%.1f%%
\n"); } */ // if just printing into a buffer, return now //if ( pageBuf ) return true; return true; } /* char *printDropDown ( int32_t n , char *p, char *pend, char *name, int32_t select, bool includeMinusOne , bool includeMinusTwo ) { // begin the drop down menu sprintf ( p , "" ); p += gbstrlen ( p ); return p; } bool printDiffbotDropDown ( SafeBuf *sb,char *name,char *THIS , SafeBuf *sx) { //CollectionRec *cr = (CollectionRec *)THIS; // . get the string we have selected // . the list of available strings to select is in // m_diffbotApiList for this collection, and that can // be changed by john to add custom diffbot api urls. // . should just be m_spiderDiffbotApiUrl[i] safebuf char *usingApi = sx->getBufStart(); if ( sx->length() == 0 ) usingApi = NULL; // now scan each item in the list. see the setting of // "m_def" for "diffbotApiList" below to see the // comma separated list of default strings. each item in // this list is of the format "|<urlPath>," //char *p = cr->m_diffbotApiList.getBufStart(); char *p = "None|none," "All|http://www.diffbot.com/api/analyze?mode=auto&fields=*," "Article (autodetect)|http://www.diffbot.com/api/analyze?mode=article&fields=*," "Article (force)|http://www.diffbot.com/api/article?fields=*," "Product (autodetect)|http://www.diffbot.com/api/analyze?mode=product&fields=*," "Product (force)|http://www.diffbot.com/v2/product?fields=*," "Image (autodetect)|http://www.diffbot.com/api/analyze?mode=image&fields=*," "Image (force)|http://www.diffbot.com/api/image?fields=*," "FrontPage (autodetect)|http://www.diffbot.com/api/analyze?mode=frontpage&fields=*," "FrontPage (force)|http://www.diffbot.com/api/frontpage?fields=*" ; // wtf? if ( ! p ) return true; // print out. cgi is "dapi%"INT32"". sb->safePrintf("<select name=%s>\n",name); // print "none" as the first option //char *sel = ""; //if ( ! usingApi ) sel = " selected"; //sb->safePrintf("<option value=\"\"%s>None</option>",sel); // the various "diffbot urls" are separated by commas for ( ; *p ; ) { // point to start of item name char *name = p; // p should now point to name of the item char *end1 = p; // point to start of url for that item for ( ; *end1 && *end1 != '|' ;end1++); // save that char *url = end1; if ( *url == '|' ) url++; // find end of url char *urlEnd = url; for ( ; *urlEnd && *urlEnd != ',' ; urlEnd++ ); // do we match it? char *sel = ""; if ( usingApi && strncmp(usingApi,url,urlEnd-url)== 0 ) sel = " selected"; if ( ! usingApi && urlEnd - url == 0 ) sel = " selected"; // advance p p = urlEnd; // skip over comma to get next one if ( *p == ',' ) p++; // use the hash as the identifier sb->safePrintf("<option value=\""); sb->safeMemcpy ( url, urlEnd - url ); sb->safePrintf("\"%s>",sel); // print item name sb->safeMemcpy ( name , end1 - name ); sb->safePrintf("</option>\n"); } sb->safePrintf("</select>"); return true; } */ bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select, bool includeMinusOne , bool includeMinusTwo ) { // begin the drop down menu sb->safePrintf ( "<select name=%s>", name ); char *s; int32_t i = -1; if ( includeMinusOne ) i = -1; // . by default, minus 2 includes minus 3, the new "FILTERED" priority // . it is link "BANNED" but does not mean the url is low quality necessarily if ( includeMinusTwo ) i = -3; // no more DELETE, etc. 
i = 0; if ( select < 0 ) select = 0; for ( ; i < n ; i++ ) { if ( i == select ) s = " selected"; else s = ""; if ( i == -3 ) sb->safePrintf ("<option value=%"INT32"%s>DELETE",i,s); else if ( i == -2 ) //sb->safePrintf ("<option value=%"INT32"%s>BANNED",i,s); continue; else if ( i == -1 ) //sb->safePrintf ("<option value=%"INT32"%s>undefined",i,s); continue; else sb->safePrintf ("<option value=%"INT32"%s>%"INT32"",i,s,i); } sb->safePrintf ( "</select>" ); return true; } class DropLangs { public: char *m_title; char *m_lang; char *m_tld; }; DropLangs g_drops[] = { {"custom",NULL,NULL}, {"web",NULL,NULL}, {"news",NULL,NULL}, {"english","en","com,us.gov,org"}, {"german","de","de"}, {"french","fr","fr"}, {"norweigian","nl","nl"}, {"spanish","es","es"}, {"italian","it","it"}, {"romantic","en,de,fr,nl,es,it","com,us.gov,org,de,fr,nl,es,it"} }; // "url filters profile" values. used to set default crawl rules // in Collectiondb.cpp's CollectionRec::setUrlFiltersToDefaults(). // for instance, UFP_NEWS spiders sites more frequently but less deep in // order to get "news" pages and articles bool printDropDownProfile ( SafeBuf* sb, char *name, CollectionRec *cr ) { sb->safePrintf ( "<select name=%s>", name ); // the type of url filters profiles //char *items[] = {"custom","web","news","chinese","shallow"}; int32_t nd = sizeof(g_drops)/sizeof(DropLangs); for ( int32_t i = 0 ; i < nd ; i++ ) { //if ( i == select ) s = " selected"; //else s = ""; char *x = cr->m_urlFiltersProfile.getBufStart(); char *s; if ( strcmp(g_drops[i].m_title, x) == 0 ) s = " selected"; else s = ""; sb->safePrintf ("<option value=%s%s>%s", g_drops[i].m_title, s, g_drops[i].m_title ); } sb->safePrintf ( "</select>"); return true; } /* char *printCheckBoxes ( int32_t n , char *p, char *pend, char *name, char *array){ for ( int32_t i = 0 ; i < n ; i++ ) { if ( i > 0 ) sprintf (p, "<input type=checkbox value=1 name=%s%"INT32"", name,i); else sprintf (p, "<input type=checkbox value=1 name=%s", name); p += gbstrlen ( p ); if ( array[i] ) { sprintf ( p , " checked"); p += gbstrlen ( p ); } sprintf ( p , ">%"INT32"  " , i ); p += gbstrlen ( p ); //if i is single digit, add another nbsp so that everything's //aligned if ( i < 10 ) sprintf(p,"  "); p +=gbstrlen(p); if ( i > 0 && (i+1) % 6 == 0 ) sprintf(p,"<br>\n"); p+=gbstrlen(p); } return p; } */ bool printCheckBoxes ( int32_t n , SafeBuf* sb, char *name, char *array){ for ( int32_t i = 0 ; i < n ; i++ ) { if ( i > 0 ) sb->safePrintf ("<input type=checkbox value=1 name=%s%"INT32"", name,i); else sb->safePrintf ("<input type=checkbox value=1 name=%s", name); if ( array[i] ) { sb->safePrintf ( " checked"); } sb->safePrintf ( ">%"INT32"  " , i ); //if i is single digit, add another nbsp so that everything's //aligned if ( i < 10 ) sb->safePrintf("  "); if ( i > 0 && (i+1) % 6 == 0 ) sb->safePrintf("<br>\n"); } return true; } bool Parms::printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r) { int32_t page = g_pages.getDynamicPageNumber ( r ); int32_t nc = r->getLong("nc",1); int32_t pd = r->getLong("pd",1); char *coll = g_collectiondb.getDefaultColl(r); CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true); bool isMasterAdmin = g_conf.isMasterAdmin ( s , r ); bool isCollAdmin = g_conf.isCollAdmin ( s , r ); //char *coll = r->getString ( "c" ); //if ( ! coll || ! coll[0] ) coll = "main"; //CollectionRec *cr = g_collectiondb.getRec ( coll ); // if "main" collection does not exist, try another //if ( ! 
cr ) cr = getCollRecFromHttpRequest ( r ); printParms2 ( sb, page, cr, nc, pd,0,0 , s,isMasterAdmin,isCollAdmin); return true; } static int32_t s_count = 0; bool Parms::printParms2 ( SafeBuf* sb , int32_t page , CollectionRec *cr , int32_t nc , int32_t pd , bool isCrawlbot , char format , // bool isJSON , TcpSocket *sock , bool isMasterAdmin , bool isCollAdmin ) { bool status = true; s_count = 0; // background color char *bg1 = LIGHT_BLUE; char *bg2 = DARK_BLUE; // background color char *bg = NULL; char *coll = NULL; if ( cr ) coll = cr->m_coll; // page aliases //if ( page == PAGE_COLLPASSWORDS ) // page = PAGE_MASTERPASSWORDS; if ( page == PAGE_COLLPASSWORDS2 ) page = PAGE_COLLPASSWORDS; GigablastRequest gr; g_parms.setToDefault ( (char *)&gr , OBJ_GBREQUEST , NULL); InjectionRequest ir; g_parms.setToDefault ( (char *)&ir , OBJ_IR , NULL); // Begin "parms":[] if (format == FORMAT_JSON ) { sb->safePrintf ("\"parms\":[\n"); } // find in parms list for ( int32_t i = 0 ; i < m_numParms ; i++ ) { // get it Parm *m = &m_parms[i]; // make sure we got the right parms for what we want if ( m->m_page != page ) continue; // and same object tpye. but allow OBJ_NONE for // PageAddUrl.cpp //if ( m->m_obj != parmObj && m->m_obj != OBJ_NONE ) continue; // skip if offset is negative, that means none // well then use OBJ_NONE now!!! //if ( m->m_off < 0 && // m->m_type != TYPE_MONOD2 && // m->m_type != TYPE_MONOM2 && // m->m_type != TYPE_CMD ) continue; // skip if hidden if ( m->m_flags & PF_HIDDEN ) continue; // or if should not show in html, like the // name of the collection, the "c" parm we do not show // generally on the html page even though it is a required parm // we have it in a hidden html input tag in Pages.cpp. if ( (m->m_flags & PF_NOHTML) && format != FORMAT_JSON && format != FORMAT_XML ) continue; // get right ptr char *THIS = NULL; if ( m->m_obj == OBJ_CONF ) THIS = (char *)&g_conf; if ( m->m_obj == OBJ_COLL ) { THIS = (char *)cr; if ( ! THIS ) continue; } if ( m->m_obj == OBJ_GBREQUEST ) THIS = (char *)&gr; if ( m->m_obj == OBJ_IR ) THIS = (char *)&ir; // might have an array, do not exceed the array size int32_t jend = m->m_max; int32_t size = jend ; char *ss = ((char *)THIS + m->m_off - 4); if ( m->m_type == TYPE_MONOD2 ) ss = NULL; if ( m->m_type == TYPE_MONOM2 ) ss = NULL; if ( m->m_max > 1 && ss ) size = *(int32_t *)ss; if ( size < jend ) jend = size; // toggle background color on group boundaries... if ( m->m_group == 1 ) { if ( bg == bg1 ) bg = bg2; else bg = bg1; } // // mdw just debug to here ... left off here //char *xx=NULL;*xx=0; // . do we have an array? if so print title on next row // UNLESS these are priority checkboxes, those can all // cluster together onto one row // . only add if not in a row of controls if ( m->m_max > 1 && m->m_type != TYPE_PRIORITY_BOXES && m->m_rowid == -1 && format != FORMAT_JSON && format != FORMAT_XML ) { // ! isJSON ) { // // make a separate table for array of parms sb->safePrintf ( //"<table width=100%% bgcolor=#d0d0e0 " //"cellpadding=4 border=1>\n" "<tr><td colspan=20 bgcolor=#%s>" "<center>" //"<font size=+1>" "<b>%s" "</b>" //"</font>" "</td></tr>\n" "<tr><td colspan=20><font size=-1>" ,DARK_BLUE,m->m_title); // print the description sb->safePrintf ( "%s" , m->m_desc ); // end the description sb->safePrintf("</font></td></tr>\n"); } // arrays always have blank line for adding stuff if ( m->m_max > 1 ) // not for PAGE_PRIORITIES! 
//m->m_page != PAGE_PRIORITIES )
			size++;

		// if m_rowid of consecutive parms are the same then they
		// are all printed in the same row, otherwise the inner loop
		// has no effect
		int32_t rowid = m_parms[i].m_rowid;

		// if not part of a complex row, just print this array
		// right up
		if ( rowid == -1 ) {
			for ( int32_t j = 0 ; j < size ; j++ )
				status &= printParm ( sb,NULL,&m_parms[i],i,
						      j, jend,
						      (char *)THIS,
						      coll,NULL,
						      bg,nc,pd,
						      false,
						      isCrawlbot,
						      format,
						      isMasterAdmin,
						      isCollAdmin,
						      sock);
			continue;
		}

		// if not first in a row, skip it, we printed it already
		if ( i > 0 && m_parms[i-1].m_rowid == rowid ) continue;

		// otherwise print everything in the row
		for ( int32_t j = 0 ; j < size ; j++ ) {
			// flip j if in this page
			int32_t newj = j;
			//if ( m->m_page == PAGE_PRIORITIES )
			//	newj = size - 1 - j;
			for ( int32_t k = i ;
			      k < m_numParms && m_parms[k].m_rowid == rowid;
			      k++ ) {
				status &= printParm(sb,NULL,&m_parms[k],k,
						    newj,jend,
						    (char *)THIS,coll,NULL,
						    bg,nc,pd,
						    j==size-1,
						    isCrawlbot,format,
						    isMasterAdmin,
						    isCollAdmin,
						    sock);
			}
		}

		// end array table
		//if ( m->m_max > 1 ) {
		//	sprintf ( p , "</table><br>\n");
		//	p += gbstrlen ( p );
		//}
	}

	// end "parms":[]
	if ( format == FORMAT_JSON ) {
		if ( m_numParms != 0 ) sb->m_length -= 2;
		sb->safePrintf("\n]\n");
	}

	return status;
}

bool Parms::printParm ( SafeBuf* sb,
			//int32_t user ,
			char *username,
			Parm *m ,
			int32_t mm , // m = &m_parms[mm]
			int32_t j ,
			int32_t jend ,
			char *THIS ,
			char *coll ,
			char *pwd ,
			char *bg ,
			int32_t nc , // # column?
			int32_t pd , // print description
			bool lastRow ,
			bool isCrawlbot ,
			//bool isJSON ) {
			char format ,
			bool isMasterAdmin ,
			bool isCollAdmin ,
			TcpSocket *sock ) {

	bool status = true;

	// do not print if no permissions
	//if ( m->m_perms != 0 &&
	//     !g_users.hasPermission(username,m->m_perms) )
	//	return status;
	//if ( m->m_perms != 0 && (m->m_perms & user) == 0 ) return status;

	// do not print some if #define _CLIENT_ is true
	//#ifdef _GLOBALSPEC_
	//if ( m->m_priv == 2 ) return status;
	//if ( m->m_priv == 3 ) return status;
	//#elif _CLIENT_
	//if ( m->m_priv ) return status;
	//#elif _METALINCS_
	//if ( m->m_priv == 2 ) return status;
	//if ( m->m_priv == 3 ) return status;
	//#endif

	// priv of 4 means do not print at all
	if ( m->m_priv == 4 ) return true;

	// do not print comments, those are for the xml conf file
	if ( m->m_type == TYPE_COMMENT ) return true;

	if ( m->m_flags & PF_HIDDEN ) return true;

	CollectionRec *cr = NULL;
	collnum_t collnum = -1;
	if ( coll ) {
		cr = g_collectiondb.getRec ( coll );
		if ( cr ) collnum = cr->m_collnum;
	}

	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// the upload button has no val, cmds too
		if ( m->m_type == TYPE_FILEUPLOADBUTTON ) return true;
	}

	int32_t page = m->m_page;

	if ( format == FORMAT_XML ) {
		sb->safePrintf ( "\t<parm>\n");
		sb->safePrintf ( "\t\t<title><![CDATA[");
		sb->cdataEncode ( m->m_title );
		sb->safePrintf ( "]]></title>\n");
		sb->safePrintf ( "\t\t<desc><![CDATA[");
		sb->cdataEncode ( m->m_desc );
		sb->safePrintf ( "]]></desc>\n");
		if ( m->m_flags & PF_REQUIRED )
			sb->safePrintf("\t\t<required>1</required>\n");
		sb->safePrintf ( "\t\t<cgi>%s</cgi>\n",m->m_cgi);
		// and default value if it exists
		char *def = m->m_def;
		if ( ! def ) def = "";
		sb->safePrintf ( "\t\t<defaultValue><![CDATA[");
		sb->cdataEncode ( def );
		sb->safePrintf ( "]]></defaultValue>\n");
		if ( page == PAGE_MASTER ||
		     page == PAGE_SEARCH ||
		     page == PAGE_SPIDER ||
		     page == PAGE_SPIDERPROXIES ||
		     page == PAGE_FILTERS ||
		     page == PAGE_MASTERPASSWORDS ||
		     page == PAGE_REPAIR ||
		     page == PAGE_LOG ) {
			sb->safePrintf ( "\t\t<currentValue><![CDATA[");
			SafeBuf xb;
			m->printVal ( &xb , collnum , 0 );//occNum
			sb->cdataEncode ( xb.getBufStart() );
			sb->safePrintf ( "]]></currentValue>\n");
		}
		sb->safePrintf ( "\t</parm>\n");
		return true;
	}
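	// For reference, the XML block above and the JSON block below
	// serialize a parm like this (field values are invented for
	// illustration; <required>/"required" only appears when the
	// PF_REQUIRED flag is set):
	//
	//   <parm>
	//           <title><![CDATA[max spiders]]></title>
	//           <desc><![CDATA[max spiders to launch]]></desc>
	//           <cgi>maxspiders</cgi>
	//           <defaultValue><![CDATA[100]]></defaultValue>
	//           <currentValue><![CDATA[100]]></currentValue>
	//   </parm>
	//
	//   {
	//           "title":"max spiders",
	//           "desc":"max spiders to launch",
	//           "cgi":"maxspiders",
	//           "defaultValue":"100",
	//           "currentValue":"100"
	//   }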
def ) def = ""; sb->safePrintf ( "\t\tcdataEncode ( def ); sb->safePrintf ( "]]>\n"); if ( page == PAGE_MASTER || page == PAGE_SEARCH || page == PAGE_SPIDER || page == PAGE_SPIDERPROXIES || page == PAGE_FILTERS || page == PAGE_MASTERPASSWORDS || page == PAGE_REPAIR || page == PAGE_LOG ) { sb->safePrintf ( "\t\tprintVal ( &xb , collnum , 0 );//occNum sb->cdataEncode ( xb.getBufStart() ); sb->safePrintf ( "]]>\n"); } sb->safePrintf ( "\t\n"); return true; } if ( format == FORMAT_JSON ) { sb->safePrintf ( "\t{\n"); sb->safePrintf ( "\t\t\"title\":\"%s\",\n",m->m_title); sb->safePrintf ( "\t\t\"desc\":\""); sb->jsonEncode ( m->m_desc ); sb->safePrintf("\",\n"); if ( m->m_flags & PF_REQUIRED ) sb->safePrintf("\t\t\"required\":1,\n"); sb->safePrintf ( "\t\t\"cgi\":\"%s\",\n",m->m_cgi); // and default value if it exists char *def = m->m_def; if ( ! def ) def = ""; sb->safePrintf ( "\t\t\"defaultValue\":\""); sb->jsonEncode(def); sb->safePrintf("\",\n"); if ( page == PAGE_MASTER || page == PAGE_SEARCH || page == PAGE_SPIDER || page == PAGE_SPIDERPROXIES || page == PAGE_FILTERS || page == PAGE_MASTERPASSWORDS || page == PAGE_REPAIR || page == PAGE_LOG ) { sb->safePrintf ( "\t\t\"currentValue\":\""); SafeBuf js; m->printVal ( &js , collnum , 0 );//occNum ); sb->jsonEncode(js.getBufStart()); sb->safePrintf("\",\n"); } sb->m_length -= 2; // hack of trailing comma sb->safePrintf("\n\t},\n"); return true; } // . if printing on crawlbot page hide these // . we repeat this logic below when printing parm titles // for the column headers in the table //char *vt = ""; //if ( isCrawlbot && // m->m_page == PAGE_FILTERS && // (strcmp(m->m_xml,"spidersEnabled") == 0 || // //strcmp(m->m_xml,"maxSpidersPerRule")==0|| // //strcmp(m->m_xml,"maxSpidersPerIp") == 0|| // strcmp(m->m_xml,"spiderIpWait") == 0 // ) ) // vt = " style=display:none;"; // what type of parameter? char t = m->m_type; // point to the data in THIS char *s = THIS + m->m_off + m->m_size * j ; // if THIS is NULL then it must be GigablastRequest or something // and is not really a persistent thing, but a one-shot deal. if ( ! THIS ) s = NULL; // . if an array, passed our end, this is the blank line at the end // . USE THIS EMPTY/DEFAULT LINE TO ADD NEW DATA TO AN ARRAY // . make at least as big as a int64_t if ( j >= jend ) s = "\0\0\0\0\0\0\0\0"; // delimit each cgi var if we need to if ( m->m_cgi && gbstrlen(m->m_cgi) > 45 ) { log(LOG_LOGIC,"admin: Cgi variable is TOO big."); char *xx = NULL; *xx = 0; } char cgi[64]; if ( m->m_cgi ) { if ( j > 0 ) sprintf ( cgi , "%s%"INT32"" , m->m_cgi , j ); else sprintf ( cgi , "%s" , m->m_cgi ); // let's try dropping the index # and just doing dup parms //sprintf ( cgi , "%s" , m->m_cgi ); } // . display title and description of the control/parameter // . the input cell of some parameters are colored char *color = ""; if ( t == TYPE_CMD || t == TYPE_BOOL2 ) color = " bgcolor=#6060ff"; if ( t == TYPE_BOOL ) { if ( *s ) color = " bgcolor=#00ff00"; else color = " bgcolor=#ff0000"; } if ( t == TYPE_BOOL || t == TYPE_BOOL2 ) { // disable controls not allowed in read only mode if ( g_conf.m_readOnlyMode && m->m_rdonly ) color = " bgcolor=#ffff00"; } bool firstInRow = false; if ( (s_count % nc) == 0 ) firstInRow = true; s_count++; if ( mm > 0 && m->m_rowid >= 0 && m_parms[mm-1].m_rowid == m->m_rowid ) firstInRow = false; int32_t firstRow = 0; //if ( m->m_page==PAGE_PRIORITIES ) firstRow = MAX_PRIORITY_QUEUES - 1; // . use a separate table for arrays // . make title and description header of that table // . 
	// . use a separate table for arrays
	// . make title and description header of that table
	// . do not print all headers if not m_hdrs, a special case for the
	//   default line in the url filters table
	if ( j == firstRow && m->m_rowid >= 0 && firstInRow && m->m_hdrs ) {
		// print description as big comment
		if ( m->m_desc && pd == 1 ) {
			// url FILTERS table description row
			sb->safePrintf ( ""
					 "\n" ,
					 DARK_BLUE);
			//p = htmlEncode ( p , pend , m->m_desc ,
			//		   m->m_desc + gbstrlen(m->m_desc));
			sb->safePrintf ( "%s" , m->m_desc );
			sb->safePrintf ( ""
					 // for "#,expression,harvestlinks.."
					 // header row in url FILTERS table
					 "\n"
					 ,DARK_BLUE);
		}
		// # column
		// do not show this for PAGE_PRIORITIES it is confusing
		if ( m->m_max > 1 ) {
			//m->m_page != PAGE_PRIORITIES ) {
			sb->safePrintf ( "#\n" );
		}
		// print all headers
		for ( int32_t k = mm ;
		      k < m_numParms && m_parms[k].m_rowid == m->m_rowid ;
		      k++ ) {
			// parm shortcut
			Parm *mk = &m_parms[k];
			// not if printing json
			//if ( format != FORMAT_HTML )continue;//isJSON )
			// skip if hidden
			if ( cr && ! cr->m_isCustomCrawl &&
			     (mk->m_flags & PF_DIFFBOT) ) continue;
			// . hide table column headers that are too advanced
			// . we repeat this logic above for the actual parms
			//char *vt = "";
			//if ( isCrawlbot &&
			//     m->m_page == PAGE_FILTERS &&
			//     (strcmp(mk->m_xml,"spidersEnabled") == 0 ||
			//      //strcmp(mk->m_xml,"maxSpidersPerRule")==0||
			//      //strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
			//      strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
			//	vt = " style=display:none;display:none;";
			//sb->safePrintf ( "" , vt );
			sb->safePrintf ( "" );
			// if it's of type checkbox in a table make it
			// toggle them all on/off
			if ( mk->m_type == TYPE_CHECKBOX &&
			     mk->m_page == PAGE_FILTERS ) {
				sb->safePrintf("",
					       m_parms[k].m_cgi,
					       m->m_max);
			}
			sb->safePrintf ( "%s", m_parms[k].m_title );
			if ( mk->m_type == TYPE_CHECKBOX &&
			     mk->m_page == PAGE_FILTERS )
				sb->safePrintf("");
			/*
			if ( m->m_page == PAGE_PRIORITIES &&
			     m_parms[k].m_type == TYPE_CHECKBOX)
				sb->safePrintf("
(toggle)", m_parms[k].m_cgi, m->m_max); */ sb->safePrintf ("\n"); } //if ( format == FORMAT_HTML ) sb->safePrintf ( "\n" ); // mdw added } // skip if hidden. diffbot api url only for custom crawls. //if(cr && ! cr->m_isCustomCrawl && (m->m_flags & PF_DIFFBOT) ) // return true; // print row start for single parm if ( m->m_max <= 1 && ! m->m_hdrs ) { if ( firstInRow ) { sb->safePrintf ( "" , bg ); } sb->safePrintf ( "" , 100/nc/2 ); } // if parm value is not default, use orange! char rr[1024]; SafeBuf val1(rr,1024); if ( m->m_type != TYPE_FILEUPLOADBUTTON ) m->printVal ( &val1 , collnum , j ); // occNum ); // test it if ( m->m_def && m->m_obj != OBJ_NONE && m->m_obj != OBJ_IR && // do not do for injectionrequest m->m_obj != OBJ_GBREQUEST && // do not do for GigablastRequest strcmp ( val1.getBufStart() , m->m_def ) ) // put non-default valued parms in orange! bg = "ffa500"; // print the title/description in current table for non-arrays if ( m->m_max <= 1 && m->m_hdrs ) { // j == 0 && m->m_rowid < 0 ) { if ( firstInRow ) sb->safePrintf ( "",bg); if ( t == TYPE_STRINGBOX ) { sb->safePrintf ( "
" "%s
",m->m_title ); if ( pd ) { status &= sb->htmlEncode (m->m_desc, gbstrlen(m->m_desc), false); // is it required? if ( m->m_flags & PF_REQUIRED ) sb->safePrintf(" " "REQUIRED"); } sb->safePrintf ( "
\n" ); } if ( t != TYPE_STRINGBOX ) { // this td will be invisible if isCrawlbot and the // parm is too advanced to display sb->safePrintf ( "m_colspan > 0 ) sb->safePrintf ( "colspan=%"INT32" ", (int32_t)m->m_colspan); sb->safePrintf ( "width=%"INT32"%%>"//" "%s
", 3*100/nc/2/4, m->m_title ); // the "site list" parm has html in description if ( pd ) { status &= sb->safeStrcpy(m->m_desc); //status &= sb->htmlEncode (m->m_desc, // gbstrlen(m->m_desc), // false); // is it required? if ( m->m_flags & PF_REQUIRED ) sb->safePrintf(" " "REQUIRED"); // print users current ip if showing the list // of "Master IPs" for admin access if ( ( m->m_page == PAGE_MASTERPASSWORDS || m->m_page == PAGE_COLLPASSWORDS ) && sock && m->m_title && strstr(m->m_title,"IP") ) sb->safePrintf(" Your current IP " "is %s.", iptoa(sock->m_ip)); } // and cgi parm if it exists //if ( m->m_def && m->m_scgi ) // sb->safePrintf(" CGI override: %s.",m->m_scgi); // just let them see the api page for this... //sb->safePrintf(" CGI: %s.",m->m_cgi); // and default value if it exists if ( m->m_def && m->m_def[0] && t != TYPE_CMD ) { char *d = m->m_def; if ( t == TYPE_BOOL || t == TYPE_CHECKBOX ) { if ( d[0]=='0' ) d = "NO"; else d = "YES"; sb->safePrintf ( " " "Default: %s." "",d); } else { sb->safePrintf (" Default: "); status &= sb->htmlEncode (d, gbstrlen(d), false); } } sb->safePrintf ( "\n" , color , 100/nc/2/4 ); } } // . print number in row if array, start at 1 for clarity's sake // . used for url filters table, etc. if ( m->m_max > 1 ) { // bg color alternates char *bgc = LIGHT_BLUE; if ( j % 2 ) bgc = DARK_BLUE; // do not print this if doing json //if ( format != FORMAT_HTML );//isJSON ) ; // but if it is in same row as previous, do not repeat it // for this same row, silly if ( firstInRow ) // && m->m_page != PAGE_PRIORITIES ) sb->safePrintf ( "" "%"INT32"\n", bgc, j );//j+1 else if ( firstInRow ) sb->safePrintf ( "" ); else //sb->safePrintf ( "" , vt); sb->safePrintf ( "" ); } //int32_t cast = m->m_cast; //if ( g_proxy.isProxy() ) cast = 0; // print the input box if ( t == TYPE_BOOL ) { char *tt, *v; if ( *s ) { tt = "YES"; v = "0"; } else { tt = "NO" ; v = "1"; } if ( g_conf.m_readOnlyMode && m->m_rdonly ) sb->safePrintf ( "read-only mode" ); // if cast=1, command IS broadcast to all hosts else sb->safePrintf ( "" // &cast=%"INT32"\">" "
%s
", g_pages.getPath(m->m_page),coll, cgi,v,//cast, tt); } else if ( t == TYPE_BOOL2 ) { if ( g_conf.m_readOnlyMode && m->m_rdonly ) sb->safePrintf ( "
read-only mode" "
"); // always use m_def as the value for TYPE_BOOL2 else sb->safePrintf ( "" //"cast=1\">" "
%s
", g_pages.getPath(m->m_page),coll, cgi,m->m_def, m->m_title); } else if ( t == TYPE_CHECKBOX ) { //char *ddd1 = ""; //char *ddd2 = ""; //if ( *s ) ddd1 = " checked"; //else ddd2 = " checked"; // just show the parm name and value if printing in json // if ( format == FORMAT_JSON ) { // isJSON ) { // if ( ! lastRow ) { // int32_t val = 0; // if ( *s ) val = 1; // sb->safePrintf("\"%s\":%"INT32",\n",cgi,val); // } // } //sb->safePrintf("
"); sb->safePrintf(""); // this is part of the "HACK" fix below. you have to // specify the cgi parm in the POST request, and // unchecked checkboxes are not included in the POST // request. //if ( lastRow && m->m_page == PAGE_FILTERS ) // sb->safePrintf("m_obj == OBJ_NONE && m->m_def[0] != '0' ) val = " checked"; if ( m->m_obj != OBJ_NONE && s && *s ) val = " checked"; // s is NULL for GigablastRequest parms if ( ! s && m->m_def && m->m_def[0]=='1' ) val = " checked"; // in case it is not checked, submit that! // if it gets checked this should be overridden then sb->safePrintf("" , cgi ); //else sb->safePrintf("m_page == PAGE_FILTERS) sb->safePrintf("id=id_%s ",cgi); sb->safePrintf("name=%s%s" //" onmouseup=\"" //"if ( this.value=='N' ) {" //"this.value='Y';" //"} " //"else if ( this.value=='Y' ) {" //"this.value='N';" //"}" //"\" " ">" ,cgi ,val);//,ddd); // // repeat for off position // //if ( ! lastRow || m->m_page != PAGE_FILTERS ) { // sb->safePrintf(" Off:m_page == PAGE_FILTERS) // sb->safePrintf("id=id_%s ",cgi); // sb->safePrintf("value=0 name=%s%s>", // cgi,ddd2); //} sb->safePrintf("" //"
" ); } else if ( t == TYPE_CHAR ) sb->safePrintf ("",cgi,(int32_t)(*s)); /* else if ( t == TYPE_CHAR2 ) sprintf (p,"",cgi,*(char*)s);*/ else if ( t == TYPE_PRIORITY ) printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s , false , false ); else if ( t == TYPE_PRIORITY2 ) { // just show the parm name and value if printing in json // if ( format==FORMAT_JSON) // isJSON ) // sb->safePrintf("\"%s\":%"INT32",\n",cgi,(int32_t)*(char *)s); // else printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s , true , true ); } // this url filters parm is an array of SAFEBUFs now, so each is // a string and that string is the diffbot api url to use. // the string is empty or zero length to indicate none. //else if ( t == TYPE_DIFFBOT_DROPDOWN ) { // char *xx=NULL;*xx=0; //} //else if ( t == TYPE_UFP ) else if ( t == TYPE_SAFEBUF && strcmp(m->m_title,"url filters profile")==0) // url filters profile drop down "ufp" printDropDownProfile ( sb , "ufp" , cr );//*s ); // do not expose master passwords or IPs to non-root admins else if ( ( m->m_flags & PF_PRIVATE ) && m->m_obj == OBJ_CONF && ! isMasterAdmin ) return true; // do not expose master passwords or IPs to non-root admins else if ( ( m->m_flags & PF_PRIVATE ) && m->m_obj == OBJ_COLL && ! isCollAdmin ) return true; else if ( t == TYPE_RETRIES ) printDropDown ( 4 , sb , cgi , *s , false , false ); else if ( t == TYPE_FILEUPLOADBUTTON ) { sb->safePrintf("",cgi); } else if ( t == TYPE_PRIORITY_BOXES ) { // print ALL the checkboxes when we get the first parm if ( j != 0 ) return status; printCheckBoxes ( MAX_SPIDER_PRIORITIES , sb , cgi , s ); } else if ( t == TYPE_CMD ) // if cast=0 it will be executed, otherwise it will be // broadcasted with cast=1 to all hosts and they will all // execute it sb->safePrintf ( "" // cast=%"INT32" "
%s
", g_pages.getPath(m->m_page),coll, cgi,m->m_title); else if ( t == TYPE_FLOAT ) { // just show the parm name and value if printing in json // if ( format == FORMAT_JSON )//isJSON ) // sb->safePrintf("\"%s\":%f,\n",cgi,*(float *)s); // else sb->safePrintf ("",cgi,*(float *)s); } else if ( t == TYPE_IP ) { if ( m->m_max > 0 && j == jend ) sb->safePrintf ("",cgi); else sb->safePrintf ("",cgi,iptoa(*(int32_t *)s)); } else if ( t == TYPE_LONG ) { // just show the parm name and value if printing in json // if ( format == FORMAT_JSON ) // isJSON ) // sb->safePrintf("\"%s\":%"INT32",\n",cgi,*(int32_t *)s); // else sb->safePrintf ("",cgi,*(int32_t *)s); } else if ( t == TYPE_LONG_CONST ) sb->safePrintf ("%"INT32"",*(int32_t *)s); else if ( t == TYPE_LONG_LONG ) sb->safePrintf ("",cgi,*(int64_t *)s); else if ( t == TYPE_STRING || t == TYPE_STRINGNONEMPTY ) { int32_t size = m->m_size; // give regular expression box on url filters page more room //if ( m->m_page == PAGE_FILTERS ) { // if ( size > REGEX_TXT_MAX ) size = REGEX_TXT_MAX; //} //else { if ( size > 20 ) size = 20; //} sb->safePrintf ("m_flags & PF_COLLDEFAULT) ) sb->safePrintf("%s",cr->m_coll); else sb->dequote ( s , gbstrlen(s) ); sb->safePrintf ("\">"); } // HACK: print a drop down not a textbox for selecting the // m_spiderDiffbotApiUrl[]. we can't just store this selection // as a number because m_diffbotApiList (a string of comma separated // items to select from) can change! it is not a typical dropdown. // so we have to record the actual text we selected, which is // basically the diffbot api url. this is because john can add // custom diffbot api urls at anytime to the list. /* else if ( t == TYPE_SAFEBUF && strcmp(m->m_cgi,"dapi") == 0 ) { SafeBuf *sx = (SafeBuf *)s; // just show the parm name and value if printing in json if ( isJSON ) { // this can be empty for the empty row i guess if ( sx->length() ) { // convert diffbot # to string sb->safePrintf("\"%s\":\"",cgi); // this is just the url path, not the title // of the menu option... so this would be // like "/api/article?u=" sb->safeUtf8ToJSON (sx->getBufStart() ); sb->safePrintf("\",\n"); } } else printDiffbotDropDown ( sb , cgi , THIS , sx ); } */ else if ( t == TYPE_CHARPTR ) { int32_t size = m->m_size; char *sp = NULL; if ( s && *s ) sp = *(char **)s; if ( ! sp ) sp = ""; if ( m->m_flags & PF_TEXTAREA ) { sb->safePrintf (""); } else { sb->safePrintf ("m_flags & PF_COLLDEFAULT) ) sb->safePrintf("%s",cr->m_coll); else if ( sp ) sb->dequote ( sp , gbstrlen(sp) ); sb->safePrintf ("\">"); } } else if ( t == TYPE_SAFEBUF ) { int32_t size = m->m_size; // give regular expression box on url filters page more room if ( m->m_page == PAGE_FILTERS ) { //if ( size > REGEX_TXT_MAX ) size = REGEX_TXT_MAX; size = 40; } else { if ( size > 20 ) size = 20; } SafeBuf *sx = (SafeBuf *)s; SafeBuf tmp; // if printing a parm in a one-shot deal like GigablastRequest // then s and sx will always be NULL, so set to default if ( ! 
sx ) { sx = &tmp; char *def = m->m_def; // if it has PF_DEFAULTCOLL flag set then use the coll if ( cr && (m->m_flags & PF_COLLDEFAULT) ) def = cr->m_coll; tmp.safePrintf("%s",def); } // just show the parm name and value if printing in json // if ( format == FORMAT_JSON ) { // isJSON ) { // // this can be empty for the empty row i guess // if ( sx->length() ) { // // convert diffbot # to string // sb->safePrintf("\"%s\":\"",cgi); // if ( m->m_obj != OBJ_NONE ) // sb->safeUtf8ToJSON (sx->getBufStart()); // sb->safePrintf("\",\n"); // } // } if ( m->m_flags & PF_TEXTAREA ) { int rows = 10; if ( m->m_flags & PF_SMALLTEXTAREA ) rows = 4; sb->safePrintf (""); } else { sb->safePrintf ("dequote ( s , gbstrlen(s) ); // note it //log("hack: %s",sx->getBufStart()); if ( cr && (m->m_flags & PF_COLLDEFAULT) && sx && sx->length() <= 0 ) sb->dequote ( cr->m_coll,gbstrlen(cr->m_coll)); // if parm is OBJ_NONE there is no stored valued else if ( m->m_obj != OBJ_NONE ) sb->dequote ( sx->getBufStart(), sx->length()); sb->safePrintf ("\">"); } } else if ( t == TYPE_STRINGBOX ) { sb->safePrintf("\n"); } else if ( t == TYPE_CONSTANT ) sb->safePrintf ("%s",m->m_title); else if ( t == TYPE_MONOD2 ) sb->safePrintf ("%"INT32"",j / 2 ); else if ( t == TYPE_MONOM2 ) { /* if ( m->m_page == PAGE_PRIORITIES ) { if ( j % 2 == 0 ) sb->safePrintf ("old"); else sb->safePrintf ("new"); } else */ sb->safePrintf ("%"INT32"",j % 2 ); } else if ( t == TYPE_RULESET ) ; // subscript is already included in "cgi" //g_pages.printRulesetDropDown ( sb , // user , // cgi , // *(int32_t *)s , // selected // -1 ); // subscript else if ( t == TYPE_TIME ) { //time is stored as a string //if time is not stored properly, just write 00:00 if ( s[2] != ':' ) strncpy ( s, "00:00", 5 ); char hr[3]; char min[3]; gbmemcpy ( hr, s, 2 ); gbmemcpy ( min, s + 3, 2 ); hr[2] = '\0'; min[2] = '\0'; // print the time in the input forms sb->safePrintf("h " "m " , cgi , hr , cgi , min ); } else if ( t == TYPE_DATE || t == TYPE_DATE2 ) { // time is stored as int32_t int32_t ct = *(int32_t *)s; // get the time struct struct tm *tp = gmtime ( (time_t *)&ct ) ; // set the "selected" month for the drop down char *ss[12]; for ( int32_t i = 0 ; i < 12 ; i++ ) ss[i]=""; int32_t month = tp->tm_mon; if ( month < 0 || month > 11 ) month = 0; // Jan ss[month] = " selected"; // print the date in the input forms sb->safePrintf( " " "\n" "" "
" "h " "m " "s" , cgi , (int32_t)tp->tm_mday , cgi , ss[0],ss[1],ss[2],ss[3],ss[4],ss[5],ss[6],ss[7],ss[8], ss[9],ss[10],ss[11], cgi , (int32_t)tp->tm_year + 1900 , cgi , (int32_t)tp->tm_hour , cgi , (int32_t)tp->tm_min , cgi , (int32_t)tp->tm_sec ); /* if ( t == TYPE_DATE2 ) { p += gbstrlen ( p ); // a int32_t after the int32_t is used for this int32_t ct = *(int32_t *)(THIS+m->m_off+4); char *ss = ""; if ( ct ) ss = " checked"; sprintf ( p , "
use current " "time\n",cgi,ss); } */ } else if ( t == TYPE_SITERULE ) { // print the siterec rules as a drop down char *ss[5]; for ( int32_t i = 0; i < 5; i++ ) ss[i] = ""; int32_t v = *(int32_t*)s; if ( v < 0 || v > 4 ) v = 0; ss[v] = " selected"; sb->safePrintf ( "\n", cgi, ss[0], ss[1], ss[2], ss[3] ); } // end the input cell sb->safePrintf ( "\n"); // "insert above" link? used for arrays only, where order matters if ( m->m_addin && j < jend ) {//! isJSON ) { sb->safePrintf ( "insert\n",coll,cgi ); // insert= // "j" is the row # "insert=%"INT32"\">insert\n",coll,j ); } // does next guy start a new row? bool lastInRow = true; // assume yes if (mm+1<m_numParms&&m_parms[mm+1].m_rowid>=0&&m_parms[mm+1].m_rowid==m->m_rowid) lastInRow = false; if ( ((s_count-1) % nc) != (nc-1) ) lastInRow = false; // . display the remove link for arrays if we need to // . but don't display if next guy does NOT start a new row //if ( m->m_max > 1 && lastInRow && ! isJSON ) { if ( m->m_addin && j < jend ) { //! isJSON ) { // m->m_page != PAGE_PRIORITIES ) { // show remove link? bool show = true; //if ( j >= jend ) show = false; // get # of rows int32_t *nr = (int32_t *)((char *)THIS + m->m_off - 4); // are we the last row? bool lastRow = false; // yes, if this is true if ( j == *nr - 1 ) lastRow = true; // do not allow removal of last default url filters rule //if ( lastRow && !strcmp(m->m_cgi,"fsp")) show = false; char *suffix = ""; if ( m->m_page == PAGE_MASTERPASSWORDS && m->m_type == TYPE_IP ) suffix = "ip"; if ( m->m_page == PAGE_MASTERPASSWORDS && m->m_type == TYPE_STRINGNONEMPTY ) suffix = "pwd"; if ( show ) sb->safePrintf ("" // remove= "remove%s=%"INT32"\">" "remove\n",coll,//cgi ); suffix, j); // j is row # else sb->safePrintf ( "\n"); } if ( lastInRow ) sb->safePrintf ("\n"); return status; } /* // get the object of our desire char *Parms::getTHIS ( HttpRequest *r , int32_t page ) { // if not master controls, must be a collection rec //if ( page < PAGE_CGIPARMS ) return (char *)&g_conf; char *coll = r->getString ( "c" ); // support john wanting to use "id" for the crawl id which is really // the collection id, hopefully won't conflict with other things. if ( ! coll ) coll = r->getString ( "id" ); if ( ! coll || ! coll[0] ) //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); CollectionRec *cr = g_collectiondb.getRec ( coll ); if ( ! cr ) log("admin: Collection \"%s\" not found.", r->getString("c") ); return (char *)cr; } */ // now we use this to set SearchInput and GigablastRequest bool Parms::setFromRequest ( HttpRequest *r , TcpSocket* s, CollectionRec *newcr , char *THIS , int32_t objType ) { // get the page from the path... like /sockets --> PAGE_SOCKETS //int32_t page = g_pages.getDynamicPageNumber ( r ); // use convertHttpRequestToParmList() for these because they // are persistent records that are updated on every shard. if ( objType == OBJ_COLL ) { char *xx=NULL;*xx=0; } if ( objType == OBJ_CONF ) { char *xx=NULL;*xx=0; } // ensure valid if ( ! THIS ) { // it is null when no collection explicitly specified... 
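// (added note; illustrative sketch only, not from the original source)
// a typical caller allocates the target object, zeroes it to defaults,
// then lets this function overlay the cgi parms from the request, e.g.
// for a hypothetical GigablastRequest "gr" and socket "sock":
//
//   GigablastRequest gr;
//   g_parms.setToDefault   ( (char *)&gr , OBJ_GBREQUEST , NULL );
//   g_parms.setFromRequest ( r , sock , NULL , (char *)&gr ,
//                            OBJ_GBREQUEST );
//
// if the caller could not produce such an object it passes NULL for
// THIS and we land here: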
log(LOG_LOGIC,"admin: THIS is null for setFromRequest"); char *xx=NULL;*xx=0; } // need this for searchInput which takes default from "cr" //CollectionRec *cr = g_collectiondb.getRec ( r , true ); // no SearchInput.cpp does this and then overrides if xml feed // to set m_docsToScanForTopics //setToDefault ( THIS , objType , cr ); // loop through cgi parms for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) { // get cgi parm name char *field = r->getField ( i ); // find in parms list int32_t j; Parm *m; for ( j = 0 ; j < m_numParms ; j++ ) { // get it m = &m_parms[j]; // skip if not our type if ( m->m_obj != objType ) continue; // skip if offset is negative, that means none if ( m->m_off < 0 ) continue; // skip if no cgi parm, may not be configurable now if ( ! m->m_cgi ) continue; // otherwise, must match the cgi name exactly if ( strcmp ( field,m->m_cgi ) == 0 ) break; } // bail if the cgi field is not in the parms list if ( j >= m_numParms ) continue; // get the value of cgi parm (null terminated) char *v = r->getValue ( i ); // empty? if ( ! v ) continue; // . skip if no value was provided // . unless it was a string! so we can make them empty. if ( v[0] == '\0' && m->m_type != TYPE_STRING && m->m_type != TYPE_STRINGBOX ) continue; // set it setParm ( (char *)THIS , m, j, 0, v, false,//not html enc false );//true ); } return true; } bool Parms::insertParm ( int32_t i , int32_t an , char *THIS ) { Parm *m = &m_parms[i]; // . shift everyone above down // . first int32_t at offset is always the count // for arrays char *pos = (char *)THIS + m->m_off ; int32_t num = *(int32_t *)(pos - 4); // ensure we are valid if ( an >= num || an < 0 ) { log("admin: Invalid insertion of element " "%"INT32" in array of size %"INT32" for \"%s\".", an,num,m->m_title); return false; } // also ensure that we have space to put the parm in, because in // case of URl filters, it is bounded by MAX_FILTERS if ( num >= MAX_FILTERS ){ log("admin: Invalid insert of element %"INT32", array is full " "in size %"INT32" for \"%s\".",an, num, m->m_title); return false; } // point to the place where the element is to be inserted char *src = pos + m->m_size * an; //point to where it is to be moved char *dst = pos + m->m_size * ( an + 1 ); // how much to move int32_t size = ( num - an ) * m->m_size ; // move them memmove ( dst , src , size ); // if the src was a TYPE_SAFEBUF clear it so we don't end up doing // a double free, etc.! memset ( src , 0 , m->m_size ); // inc the count *(int32_t *)(pos-4) = (*(int32_t *)(pos-4)) + 1; // put the defaults in the inserted line setParm ( (char *)THIS , m , i , an , m->m_def , false ,false ); return true; } bool Parms::removeParm ( int32_t i , int32_t an , char *THIS ) { Parm *m = &m_parms[i]; // . shift everyone above down // . first int32_t at offset is always the count // for arrays char *pos = (char *)THIS + m->m_off ; int32_t num = *(int32_t *)(pos - 4); // ensure we are valid if ( an >= num || an < 0 ) { log("admin: Invalid removal of element " "%"INT32" in array of size %"INT32" for \"%s\".", an,num,m->m_title); return false; } // point to the element being removed char *dst = pos + m->m_size * an; // free memory pointed to by safebuf, if we are safebuf, before // overwriting it... 
prevents a memory leak if ( m->m_type == TYPE_SAFEBUF ) { SafeBuf *dx = (SafeBuf *)dst; dx->purge(); } // then point to the good stuf char *src = pos + m->m_size * (an+1); // how much to bury it with int32_t size = (num - an - 1 ) * m->m_size ; // bury it gbmemcpy ( dst , src , size ); // and detach the buf on the tail so it doesn't core in Mem.cpp // when it tries to free... if ( m->m_type == TYPE_SAFEBUF ) { SafeBuf *tail = (SafeBuf *)(pos + m->m_size * (num-1)); tail->detachBuf(); } // dec the count *(int32_t *)(pos-4) = (*(int32_t *)(pos-4)) - 1; return true; } void Parms::setParm ( char *THIS , Parm *m , int32_t mm , int32_t j , char *s , bool isHtmlEncoded , bool fromRequest ) { if ( fromRequest ) { char *xx=NULL;*xx=0; } // . this is just for setting CollectionRecs, so skip if offset < 0 // . some parms are just for SearchInput (search parms) if ( m->m_off < 0 ) return; if ( m->m_obj == OBJ_NONE ) return ; float oldVal = 0; float newVal = 0; if ( ! s && m->m_type != TYPE_CHARPTR && m->m_type != TYPE_FILEUPLOADBUTTON && m->m_defOff==-1) { s = "0"; char *tit = m->m_title; if ( ! tit || ! tit[0] ) tit = m->m_xml; log(LOG_LOGIC,"admin: Parm \"%s\" had NULL default value. " "Forcing to 0.", tit); //char *xx = NULL; *xx = 0; } // sanity check if ( &m_parms[mm] != m ) { log(LOG_LOGIC,"admin: Not sane parameters."); char *xx = NULL; *xx = 0; } // if attempting to add beyond array max, bail out if ( j >= m->m_max && j >= m->m_fixed ) { log ( "admin: Attempted to set parm beyond limit. Aborting." ); return; } // if we are setting a guy in an array AND he is NOT the first // in his row, ensure the guy before has a count of j+1 or more. // // crap, on the url filters page if you do not check "spidering // enabled" checkbox when adding a new rule at the bottom of the // table, , then the spidering enabled parameter does not transmit so // the "respider frequency" ends up checking the "spider enabled" // array whose "count" was not incremented like it should have been. // HACK: make new line at bottom always have spidering enabled // checkbox set and make it impossible to unset. /* if ( m->m_max > 1 && m->m_rowid >= 0 && mm > 0 && m_parms[mm-1].m_rowid == m->m_rowid ) { char *pos = (char *)THIS + m_parms[mm-1].m_off - 4 ; int32_t maxcount = *(int32_t *)pos; if ( j >= maxcount ) { log("admin: parm before \"%s\" is limiting us", m_parms[mm-1].m_title); //log("admin: try nuking the url filters or whatever " // "and re-adding"); return; } } */ // ensure array count at least j+1 if ( m->m_max > 1 ) { // . is this element we're adding bumping up the count? // . array count is 4 bytes before the array char *pos = (char *)THIS + m->m_off - 4 ; // set the count to it if it is bigger than current count if ( j + 1 > *(int32_t *)pos ) *(int32_t *)pos = j + 1; } char t = m->m_type; if ( t == TYPE_CHAR || t == TYPE_CHAR2 || t == TYPE_CHECKBOX || t == TYPE_BOOL || t == TYPE_BOOL2 || t == TYPE_PRIORITY || t == TYPE_PRIORITY2 || //t == TYPE_DIFFBOT_DROPDOWN || t == TYPE_UFP || t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES || t == TYPE_FILTER ) { if ( fromRequest && *(char *)(THIS + m->m_off + j) == atol(s)) return; if ( fromRequest)oldVal = (float)*(char *)(THIS + m->m_off +j); *(char *)(THIS + m->m_off + j) = atol ( s ); newVal = (float)*(char *)(THIS + m->m_off + j); goto changed; } else if ( t == TYPE_CHARPTR ) { // "s" might be NULL or m->m_def... *(char **)(THIS + m->m_off + j) = s; } else if ( t == TYPE_FILEUPLOADBUTTON ) { // "s" might be NULL or m->m_def... 
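// (added note) like TYPE_CHARPTR above, this branch stores the caller's
// pointer itself rather than copying the bytes it points to, so "s"
// must remain valid for the lifetime of the object THIS points at.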
*(char **)(THIS + m->m_off + j) = s; } else if ( t == TYPE_CMD ) { log(LOG_LOGIC, "conf: Parms: TYPE_CMD is not a cgi var."); return; } else if ( t == TYPE_DATE2 || t == TYPE_DATE ) { int32_t v = (int32_t)atotime ( s ); if ( fromRequest && *(int32_t *)(THIS + m->m_off + 4*j) == v ) return; *(int32_t *)(THIS + m->m_off + 4*j) = v; if ( v < 0 ) log("conf: Date for <%s> of \"" "%s\" is not in proper format like: " "01 Jan 1980 22:45",m->m_xml,s); goto changed; } else if ( t == TYPE_FLOAT ) { if( fromRequest && *(float *)(THIS + m->m_off + 4*j) == (float)atof ( s ) ) return; // if changed within .00001 that is ok too, do not count // as changed, the atof() has roundoff errors //float curVal = *(float *)(THIS + m->m_off + 4*j); //float newVal = atof(s); //if ( newVal < curVal && newVal + .000001 >= curVal ) return; //if ( newVal > curVal && newVal - .000001 <= curVal ) return; if ( fromRequest ) oldVal = *(float *)(THIS + m->m_off + 4*j); *(float *)(THIS + m->m_off + 4*j) = (float)atof ( s ); newVal = *(float *)(THIS + m->m_off + 4*j); goto changed; } else if ( t == TYPE_DOUBLE ) { if( fromRequest && *(double *)(THIS + m->m_off + 4*j) == (double)atof ( s ) ) return; if ( fromRequest ) oldVal = *(double *)(THIS + m->m_off + 4*j); *(double *)(THIS + m->m_off + 4*j) = (double)atof ( s ); newVal = *(double *)(THIS + m->m_off + 4*j); goto changed; } else if ( t == TYPE_IP ) { if ( fromRequest && *(int32_t *)(THIS + m->m_off + 4*j) == (int32_t)atoip (s,gbstrlen(s) ) ) return; *(int32_t *)(THIS + m->m_off + 4*j) = (int32_t)atoip (s,gbstrlen(s) ); goto changed; } else if ( t == TYPE_LONG || t == TYPE_LONG_CONST || t == TYPE_RULESET|| t == TYPE_SITERULE ) { int32_t v = atol ( s ); // min is considered valid if >= 0 if ( m->m_min >= 0 && v < m->m_min ) v = m->m_min; if ( fromRequest && *(int32_t *)(THIS + m->m_off + 4*j) == v ) return; if ( fromRequest)oldVal=(float)*(int32_t *)(THIS + m->m_off +4*j); *(int32_t *)(THIS + m->m_off + 4*j) = v; newVal = (float)*(int32_t *)(THIS + m->m_off + 4*j); goto changed; } else if ( t == TYPE_LONG_LONG ) { if ( fromRequest && *(uint64_t *)(THIS + m->m_off+8*j)== strtoull(s,NULL,10)) return; *(int64_t *)(THIS + m->m_off + 8*j) = strtoull(s,NULL,10); goto changed; } // like TYPE_STRING but dynamically allocates else if ( t == TYPE_SAFEBUF ) { int32_t len = gbstrlen(s); // no need to truncate since safebuf is dynamic //if ( len >= m->m_size ) len = m->m_size - 1; // truncate!! //char *dst = THIS + m->m_off + m->m_size*j ; // point to the safebuf, in the case of an array of // SafeBufs "j" is the # in the array, starting at 0 SafeBuf *sb = (SafeBuf *)(THIS+m->m_off+(j*sizeof(SafeBuf)) ); int32_t oldLen = sb->length(); // why was this commented out??? we need it now that we // send email alerts when parms change! if ( fromRequest && ! isHtmlEncoded && oldLen == len && memcmp ( sb->getBufStart() , s , len ) == 0 ) return; // nuke it sb->purge(); // this means that we can not use string POINTERS as parms!! if ( ! isHtmlEncoded ) sb->safeMemcpy ( s , len ); else len = sb->htmlDecode (s,len,false,0); // tag it sb->setLabel ( "parm1" ); // ensure null terminated sb->nullTerm(); // note it //log("hack: %s",s); // null term it all //dst[len] = '\0'; //sb->reserve ( 1 ); // null terminate but do not include as m_length so the // memcmp() above still works right //sb->m_buf[sb->m_length] = '\0'; // . might have to set length // . 
used for CollectionRec::m_htmlHeadLen and m_htmlTailLen //if ( m->m_plen >= 0 ) // *(int32_t *)(THIS + m->m_plen) = len ; goto changed; } else if ( t == TYPE_STRING || t == TYPE_STRINGBOX || t == TYPE_STRINGNONEMPTY || t == TYPE_TIME ) { int32_t len = gbstrlen(s); if ( len >= m->m_size ) len = m->m_size - 1; // truncate!! char *dst = THIS + m->m_off + m->m_size*j ; // why was this commented out??? we need it now that we // send email alerts when parms change! if ( fromRequest && ! isHtmlEncoded && (int32_t)gbstrlen(dst) == len && memcmp ( dst , s , len ) == 0 ) return; // this means that we can not use string POINTERS as parms!! if ( ! isHtmlEncoded ) {gbmemcpy ( dst , s , len ); } else len = htmlDecode (dst , s,len,false,0); dst[len] = '\0'; // . might have to set length // . used for CollectionRec::m_htmlHeadLen and m_htmlTailLen if ( m->m_plen >= 0 ) *(int32_t *)(THIS + m->m_plen) = len ; goto changed; } changed: // tell gigablast the value is EXPLICITLY given -- no longer based // on default.conf //if ( m->m_obj == OBJ_COLL ) ((CollectionRec *)THIS)->m_orig[mm] = 2; // we do not recognize timezones correctly when this is serialized // into coll.conf, it says UTC, which is ignored in HttpMime.cpp's // atotime() function. and when we submit it i think we use the // local time zone, so the values end up changing every time we // submit!!! i think it might read it in as UTC then write it out // as local time, or vice versa. if ( t == TYPE_DATE || t == TYPE_DATE2 ) return; // do not send if setting from startup if ( ! fromRequest ) return; // note it in the log log("admin: parm \"%s\" changed value",m->m_title); int64_t nowms = gettimeofdayInMillisecondsLocal(); // . note it in statsdb // . record what parm change and from/to what value g_statsdb.addStat ( 0, // niceness , "parm_change" , nowms, nowms, 0 , // value m->m_hash , // parmHash oldVal, newVal); // if they turn spiders on or off then tell spiderloop to update // the active list //if ( strcmp(m->m_cgi,"cse") ) // g_spiderLoop.m_activeListValid = false; // only send email alerts if we are host 0 since everyone syncs up // with host #0 anyway if ( g_hostdb.m_hostId != 0 ) return; // send an email alert notifying the admins that this parm was changed // BUT ALWAYS send it if email alerts were just TURNED OFF // ("sea" = Send Email Alerts) if ( ! g_conf.m_sendEmailAlerts && strcmp(m->m_cgi,"sea") != 0 ) return; // if spiders we turned on, do not send an email alert, cuz we // turn them on when we restart the cluster if ( strcmp(m->m_cgi,"se")==0 && g_conf.m_spideringEnabled ) return; char tmp[1024]; Host *h0 = g_hostdb.getHost ( 0 ); int32_t ip0 = 0; if ( h0 ) ip0 = h0->m_ip; sprintf(tmp,"%s: parm \"%s\" changed value",iptoa(ip0),m->m_title); g_pingServer.sendEmail ( NULL , // Host ptr tmp , // msg true , // sendToAdmin false , // oom? false , // kernel error? true , // parm change? true );// force it? even if disabled? // now the spider collection can just check the collection rec //int64_t nowms = gettimeofdayInMilliseconds(); //((CollectionRec *)THIS)->m_lastUpdateTime = nowms; return; } Parm *Parms::getParmFromParmHash ( int32_t parmHash ) { for ( int32_t i = 0 ; i < m_numParms ; i++ ) { Parm *m = &m_parms[i]; if ( m->m_hash != parmHash ) continue; return m; } return NULL; } void Parms::setToDefault ( char *THIS , char objType , CollectionRec *argcr ) { // init if we should init(); // . clear out any coll rec to get the diffbotApiNum dropdowns // . 
this is a backwards-compatibility hack since this new parm // will not be in old coll.conf files and will not be properly // initialize when displaying a url filter row. //if ( THIS != (char *)&g_conf ) { // CollectionRec *cr = (CollectionRec *)THIS; // memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS); //} for ( int32_t i = 0 ; i < m_numParms ; i++ ) { Parm *m = &m_parms[i]; if ( m->m_obj != objType ) continue; if ( m->m_obj == OBJ_NONE ) continue; if ( m->m_type == TYPE_COMMENT ) continue; // no, we gotta set GigablastRequest::m_contentFile to NULL //if ( m->m_type == TYPE_FILEUPLOADBUTTON ) // continue; if ( m->m_type == TYPE_MONOD2 ) continue; if ( m->m_type == TYPE_MONOM2 ) continue; if ( m->m_type == TYPE_CMD ) continue; if (THIS == (char *)&g_conf && m->m_obj != OBJ_CONF ) continue; if (THIS != (char *)&g_conf && m->m_obj == OBJ_CONF ) continue; // what is this? //if ( m->m_obj == OBJ_COLL ) { // CollectionRec *cr = (CollectionRec *)THIS; // if ( cr->m_bases[1] ) { char *xx=NULL;*xx=0; } //} // sanity check, make sure it does not overflow if ( m->m_obj == OBJ_COLL && m->m_off > (int32_t)sizeof(CollectionRec)){ log(LOG_LOGIC,"admin: Parm in Parms.cpp should use " "OBJ_COLL not OBJ_CONF"); char *xx = NULL; *xx = 0; } //if ( m->m_page == PAGE_PRIORITIES ) // log("hey"); // or if ( m->m_page > PAGE_API && // CGIPARMS && m->m_page != PAGE_NONE && m->m_obj == OBJ_CONF ) { log(LOG_LOGIC,"admin: Page can not reference " "g_conf and be declared AFTER PAGE_CGIPARMS in " "Pages.h. Title=%s",m->m_title); char *xx = NULL; *xx = 0; } // if defOff >= 0 get from cr like for searchInput vals // whose default is from the collectionRec... if ( m->m_defOff >= 0 && argcr ) { if ( ! argcr ) { char *xx=NULL;*xx=0; } char *def = m->m_defOff+(char *)argcr; char *dst = (char *)THIS + m->m_off; gbmemcpy ( dst , def , m->m_size ); continue; } // leave arrays empty, set everything else to default if ( m->m_max <= 1 ) { //if ( i == 282 ) // "query" parm // log("hey"); //if ( ! m->m_def ) { char *xx=NULL;*xx=0; } setParm ( THIS , m, i, 0, m->m_def, false/*not enc.*/, false ); //((CollectionRec *)THIS)->m_orig[i] = 1; //m->m_orig = 0; // set in setToDefaults() } // these are special, fixed size arrays if ( m->m_fixed > 0 ) { for ( int32_t k = 0 ; k < m->m_fixed ; k++ ) { setParm(THIS,m,i,k,m->m_def,false/*not enc.*/, false); //m->m_orig = 0; // set in setToDefaults() //((CollectionRec *)THIS)->m_orig[i] = 1; } continue; } // make array sizes 0 if ( m->m_max <= 1 ) continue; // otherwise, array is not fixed size char *s = THIS + m->m_off ; // set count to 1 if a default is present //if ( m->m_def[0] ) *(int32_t *)(s-4) = 1; //else *(int32_t *)(s-4) = 0; *(int32_t *)(s-4) = 0; } } // . returns false and sets g_errno on error // . you should set your "THIS" to its defaults before calling this bool Parms::setFromFile ( void *THIS , char *filename , char *filenameDef , char objType ) { // make sure we're init'd init(); // let em know //if ( THIS == &g_conf) log (LOG_INIT,"conf: Reading %s." , filename ); // . let the log know what we are doing // . filename is NULL if a call from CollectionRec::setToDefaults() Xml xml; //char buf [ MAX_XML_CONF ]; SafeBuf sb; if ( filename&&!setXmlFromFile(&xml,filename,&sb)){//buf,MAX_XML_CONF)) log("parms: error setting from file %s: %s",filename, mstrerror(g_errno)); return false; } // . all the collectionRecs have the same default file in // the workingDir/collections/default.conf // . so use our built in buffer for that /* if ( THIS != &g_conf && ! 
m_isDefaultLoaded ) { m_isDefaultLoaded = true; File f; f.set ( filenameDef ); if ( ! f.doesExist() ) { log(LOG_INIT, "db: Default collection configuration file " "%s was not found. Newly created collections " "will use hard coded defaults.",f.getFilename()); goto skip; } if ( ! setXmlFromFile ( &m_xml2 , filenameDef , m_buf , MAX_XML_CONF ) ) return false; } skip: */ int32_t vlen; char *v ; //char c ; int32_t numNodes = xml.getNumNodes(); int32_t numNodes2 = m_xml2.getNumNodes(); // now set THIS based on the parameters in the xml file for ( int32_t i = 0 ; i < m_numParms ; i++ ) { // get it Parm *m = &m_parms[i]; if ( m->m_obj != objType ) continue; if ( m->m_obj == OBJ_NONE ) continue; //log(LOG_DEBUG, "Parms: %s: parm: %s", filename, m->m_xml); // . there are 2 object types, coll recs and g_conf, aka // OBJ_COLL and OBJ_CONF. // . make sure we got the right parms for what we want if ( THIS == &g_conf && m->m_obj != OBJ_CONF ) continue; if ( THIS != &g_conf && m->m_obj == OBJ_CONF ) continue; // skip comments and command if ( m->m_type == TYPE_COMMENT ) continue; if ( m->m_type == TYPE_FILEUPLOADBUTTON ) continue; if ( m->m_type == TYPE_MONOD2 ) continue; if ( m->m_type == TYPE_MONOM2 ) continue; if ( m->m_type == TYPE_CMD ) continue; if ( m->m_type == TYPE_CONSTANT ) continue; // these are special commands really if ( m->m_type == TYPE_BOOL2 ) continue; //if ( strcmp ( m->m_xml , "forceDeleteUrls" ) == 0 ) // log("got it"); // we did not get one from first xml file yet bool first = true; // array count int32_t j = 0; // node number int32_t nn = 0; // a tmp thingy char tt[1]; int32_t nb; int32_t newnn; loop: if ( m->m_obj == OBJ_NONE ) { char *xx=NULL;*xx=0; } // get xml node number of m->m_xml in the "xml" file newnn = xml.getNodeNum(nn,1000000,m->m_xml,gbstrlen(m->m_xml)); #ifdef _GLOBALSPEC_ if ( m->m_priv == 2 ) continue; if ( m->m_priv == 3 ) continue; #elif _CLIENT_ // always use default value if client not allowed control of if ( m->m_priv ) continue; #elif _METALINCS_ if ( m->m_priv == 2 ) continue; if ( m->m_priv == 3 ) continue; #endif // debug //log("%s --> %"INT32"",m->m_xml,nn); // try default xml file if none, but only if first try if ( newnn < 0 && first ) goto try2; // it is valid, use it nn = newnn; // set the flag, we've committed the array to the first file first = false; // otherwise, we had some in this file, but now we're out if ( nn < 0 ) continue; // . next node is the value of this tag // . skip if none there if ( nn + 1 >= numNodes ) continue; // point to it v = xml.getNode ( nn + 1 ); vlen = xml.getNodeLen ( nn + 1 ); // if a back tag... set the value to the empty string if ( v[0] == '<' && v[1] == '/' ) vlen = 0; // now, extricate from the tag if we need to if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_SAFEBUF || m->m_type == TYPE_STRINGNONEMPTY ) { char *oldv = v; int32_t oldvlen = vlen; // if next guy is NOT a tag node, try the next one if ( v[0] != '<' && nn + 2 < numNodes ) { v = xml.getNode ( nn + 2 ); vlen = xml.getNodeLen ( nn + 2 ); } // should be a if ( vlen<12 || strncasecmp(v," tag found " "for \"<%s>\" tag. Trying without CDATA.", m->m_xml); v = oldv; vlen = oldvlen; } // point to the nugget else { v += 9; vlen -= 12; } } // get the value //v = xml.getString ( nn , nn+2 , m->m_xml , &vlen ); // this only happens when tag is there, but without a value if ( ! v || vlen == 0 ) { vlen = 0; v = tt; } //c = v[vlen]; v[vlen]='\0'; if ( vlen == 0 ){ // . this is generally ok // . 
this is spamming the log so i am commenting out! (MDW) //log(LOG_INFO, "parms: %s: Empty value.", m->m_xml); // Allow an empty string //continue; } // now use proper cdata // we can't do this and be backwards compatible right now //nb = cdataDecode ( v , v , 0 );//, vlen , false ,0); // now decode it into itself nb = htmlDecode ( v , v , vlen , false ,0); v[nb] = '\0'; // set our parm setParm ( (char *)THIS, m, i, j, v, false/*is html encoded?*/, false ); // we were set from the explicit file //((CollectionRec *)THIS)->m_orig[i] = 2; // go back //v[vlen] = c; // do not repeat same node nn++; // try to get the next node if we're an array if ( ++j < m->m_max || j < m->m_fixed ) { goto loop; } // otherwise, if not an array, go to next parm continue; try2: // get xml node number of m->m_xml in the "m_xml" file nn = m_xml2.getNodeNum(nn,1000000,m->m_xml,gbstrlen(m->m_xml)); // otherwise, we had one in file, but now we're out if ( nn < 0 ) { // if it was ONLY a search input parm, with no // default value that can be changed in the // CollectionRec then skip it // if ( m->m_soff != -1 && // m->m_off == -1 && // m->m_smaxc == -1 ) // continue; // . if it is a string, like and default is // NULL then don't worry about reporting it // . no, just make the default "" then //if ( m->m_type==TYPE_STRING && ! m->m_def) continue; // bitch that it was not found //if ( ! m->m_def[0] ) // log("conf: %s does not have <%s> tag. " // "Omitting.",filename,m->m_xml); //else /* if ( ! m->m_def ) //m->m_def[0] ) log("conf: %s does not have <%s> tag. Using " "default value of \"%s\".", filename, m->m_xml,m->m_def); */ continue; } // . next node is the value of this tag // . skip if none there if ( nn + 1 >= numNodes2 ) continue; // point to it v = m_xml2.getNode ( nn + 1 ); vlen = m_xml2.getNodeLen ( nn + 1 ); // if a back tag... set the value to the empty string if ( v[0] == '<' && v[1] == '/' ) vlen = 0; // now, extricate from the tag if we need to if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_STRINGNONEMPTY ) { char *oldv = v; int32_t oldvlen = vlen; // reset if not a tag node if ( v[0] != '<' && nn + 2 < numNodes2 ) { v = m_xml2.getNode ( nn + 2 ); vlen = m_xml2.getNodeLen ( nn + 2 ); } // should be a if ( vlen<12 || strncasecmp(v," tag found " "for \"<%s>\" tag. Trying without CDATA.", m->m_xml); v = oldv; vlen = oldvlen; } // point to the nugget else { v += 9; vlen -= 12; } } // get the value //v = m_xml2.getString ( nn , nn+2 , m->m_xml , &vlen ); // this only happens when tag is there, but without a value if ( ! 
v || vlen == 0 ) { vlen = 0; v = tt; } //c = v[vlen]; v[vlen]='\0'; // now decode it into itself nb = htmlDecode ( v , v , vlen , false,0); v[nb] = '\0'; // set our parm setParm ( (char *)THIS, m, i, j, v, false/*is html encoded?*/, false ); // we were set from the backup default file //((CollectionRec *)THIS)->m_orig[i] = 1; // go back //v[vlen] = c; // do not repeat same node nn++; // try to get the next node if we're an array if ( ++j < m->m_max || j < m->m_fixed ) { goto loop; } // otherwise, if not an array, go to next parm continue; } // backwards compatible hack for old tags for ( int32_t i = 1 ; i < numNodes ; i++ ) { if ( objType != OBJ_CONF ) break; XmlNode *pn = &xml.m_nodes[i-1]; XmlNode *xn = &xml.m_nodes[i]; // look for if ( pn->m_tagNameLen != 14 ) continue; if ( xn->m_tagNameLen != 8 ) continue; // if it is not the OLD supported tag then skip if ( strncmp ( pn->m_tagName,"masterPassword",14 ) ) continue; if ( strncmp ( xn->m_tagName,"![CDATA[",8 ) ) continue; // otherwise append to buf char *text = xn->m_node + 9; int32_t tlen = xn->m_nodeLen - 12; g_conf.m_masterPwds.safeMemcpy(text,tlen); // a \n g_conf.m_masterPwds.pushChar('\n'); g_conf.m_masterPwds.nullTerm(); } // another backwards compatible hack for old masterIp tags for ( int32_t i = 1 ; i < numNodes ; i++ ) { if ( objType != OBJ_CONF ) break; XmlNode *xn = &xml.m_nodes[i]; XmlNode *pn = &xml.m_nodes[i-1]; // look for if ( pn->m_tagNameLen != 8 ) continue; if ( xn->m_tagNameLen != 8 ) continue; // if it is not the OLD supported tag then skip if ( strncmp ( pn->m_tagName,"masterIp",8 ) ) continue; if ( strncmp ( xn->m_tagName,"![CDATA[",8 ) ) continue; // otherwise append to buf char *text = xn->m_node + 9; int32_t tlen = xn->m_nodeLen - 12; // otherwise append to buf g_conf.m_connectIps.safeMemcpy(text,tlen); // a \n g_conf.m_connectIps.pushChar('\n'); g_conf.m_connectIps.nullTerm(); } /* // no! now we warn with a redbox alert // always make sure we got some admin security if ( g_conf.m_numMasterIps <= 0 && g_conf.m_numMasterPwds <= 0 ) { //log(LOG_INFO, // "conf: No master IP or password provided. Using default " // "password 'footbar23'." ); //g_conf.m_masterIps[0] = atoip ( "64.139.94.202", 13 ); //g_conf.m_numMasterIps = 1; strcpy ( g_conf.m_masterPwds[0] , "footbar23" ); g_conf.m_numMasterPwds = 1; } */ return true; } // returns false and sets g_errno on error bool Parms::setXmlFromFile(Xml *xml, char *filename, SafeBuf *sb ) { // File f; // f.set ( filename ); // is it too big? // int32_t fsize = f.getFileSize(); // if ( fsize > bufSize ) { // log ("conf: File size of %s is %"INT32", must be " // "less than %"INT32".",f.getFilename(),fsize,bufSize ); // char *xx = NULL; *xx = 0; // } // open it for reading // f.set ( filename ); // if ( ! f.open ( O_RDONLY ) ) // return log("conf: Could not open %s: %s.", // filename,mstrerror(g_errno)); // // read in the file // int32_t numRead = f.read ( buf , bufSize , 0 /*offset*/ ); // f.close ( ); // if ( numRead != fsize ) // return log ("conf: Could not read %s : %s.", // filename,mstrerror(g_errno)); // // null terminate it // buf [ fsize ] = '\0'; sb->load ( filename ); char *buf = sb->getBufStart(); if ( ! buf ) return log ("conf: Could not read %s : %s.", filename,mstrerror(g_errno)); // . remove all comments in case they contain tags // . if you have a # as part of your string, it must be html encoded, // just like you encode < and > char *s = buf; char *d = buf; while ( *s ) { // . skip comments // . 
watch out for html encoded pound signs though if ( *s == '#' ) { if (s>buf && *(s-1)=='&' && is_digit(*(s+1))) goto ok; while ( *s && *s != '\n' ) s++; continue; } // otherwise, transcribe over ok: *d++ = *s++; } *d = '\0'; int32_t bufSize = d - buf; // . set to xml // . use version of 0 return xml->set ( buf , bufSize , false , // ownData 0 , // allocSize false , // pureXml? 0 , // version true , // setParents 0 , // niceness CT_XML ); } //#define MAX_CONF_SIZE 200000 // returns false and sets g_errno on error bool Parms::saveToXml ( char *THIS , char *f , char objType ) { if ( g_conf.m_readOnlyMode ) return true; // print into buffer // "seeds" can be pretty big so go with safebuf now // fix so if we core in malloc/free we can still save conf char tmpbuf[200000]; SafeBuf sb(tmpbuf,200000); //char *p = buf; //char *pend = buf + MAX_CONF_SIZE; int32_t len ; //int32_t n ; File ff ; int32_t j ; int32_t count; char *s; CollectionRec *cr = NULL; if ( THIS != (char *)&g_conf ) cr = (CollectionRec *)THIS; // now set THIS based on the parameters in the xml file for ( int32_t i = 0 ; i < m_numParms ; i++ ) { // get it Parm *m = &m_parms[i]; if ( m->m_obj != objType ) continue; // . there are 2 object types, coll recs and g_conf, aka // OBJ_COLL and OBJ_CONF. // . make sure we got the right parms for what we want if ( m->m_obj == OBJ_NONE ) continue; // skip dups if ( m->m_flags & PF_DUP ) continue; // do not allow searchinput parms through if ( m->m_obj == OBJ_SI ) continue; if ( THIS == (char *)&g_conf && m->m_obj != OBJ_CONF) continue; if ( THIS != (char *)&g_conf && m->m_obj == OBJ_CONF) continue; if ( m->m_type == TYPE_MONOD2 ) continue; if ( m->m_type == TYPE_MONOM2 ) continue; if ( m->m_type == TYPE_CMD ) continue; if ( m->m_type == TYPE_BOOL2 ) continue; if ( m->m_type == TYPE_FILEUPLOADBUTTON ) continue; // ignore if hidden as well! no, have to keep those separate // since spiderroundnum/starttime is hidden but should be saved if ( m->m_flags & PF_NOSAVE ) continue; // ignore if diffbot and we are not a diffbot/custom crawl if ( cr && ! cr->m_isCustomCrawl && (m->m_flags & PF_DIFFBOT) ) continue; // skip if we should not save to xml if ( ! m->m_save ) continue; // allow comments though if ( m->m_type == TYPE_COMMENT ) goto skip2; // skip if this was compiled for a client and they should not // see this control //#ifdef _GLOBALSPEC_ // if ( m->m_priv == 2 ) continue; // if ( m->m_priv == 3 ) continue; //#elif _CLIENT_ // if ( m->m_priv ) continue; //#elif _METALINCS_ // if ( m->m_priv == 2 ) continue; // if ( m->m_priv == 3 ) continue; //#endif // skip if offset is negative, that means none s = (char *)THIS + m->m_off ; // if array, count can be 0 or more than 1 count = 1; if ( m->m_max > 1 ) count = *(int32_t *)(s-4); if ( m->m_fixed > 0 ) count = m->m_fixed; // sanity check if ( count > 100000 ) { log(LOG_LOGIC,"admin: Outrageous array size in for " "parameter %s. Does the array max size int32_t " "preceed it in the conf class?",m->m_title); exit(-1); } skip2: // description, do not wrap words around lines char *d = m->m_desc; // if empty array mod description to include the tag name char tmp [10*1024]; if ( m->m_max > 1 && count == 0 && gbstrlen(d) < 9000 && m->m_xml && m->m_xml[0] ) { char *cc = ""; if ( d && d[0] ) cc = "\n"; sprintf ( tmp , "%s%sUse <%s> tag.",d,cc,m->m_xml); d = tmp; } char *END = d + gbstrlen(d); char *dend; char *last; char *start; // just print tag if it has no description if ( ! 
*d ) goto skip; //if ( p + gbstrlen(d)+5 >= pend ) goto hadError; //if ( p > buf ) *p++='\n'; if ( sb.length() ) sb.pushChar('\n'); loop: dend = d + 77; if ( dend > END ) dend = END; last = d; start = d; while ( *d && d < dend ) { if ( *d == ' ' ) last = d; if ( *d == '\n' ) { last = d; break; } d++; } if ( ! *d ) last = d; //gbmemcpy ( p , "# " , 2 ); //p += 2; sb.safeMemcpy("# ",2); //gbmemcpy ( p , start , last - start ); //p += last - start; sb.safeMemcpy(start,last-start); //*p++='\n'; sb.pushChar('\n'); d = last + 1; if ( d < END && *d ) goto loop; // bail if comment if ( m->m_type == TYPE_COMMENT ) { //sprintf ( p , "\n" ); //p += gbstrlen ( p ); continue; } if ( m->m_type == TYPE_MONOD2 ) continue; if ( m->m_type == TYPE_MONOM2 ) continue; skip: /* . note: this code commented out because it was specific to an old client // if value is from default collection file, do not // explicitly list it if ( m->m_obj == OBJ_COLL && ((CollectionRec *)THIS)->m_orig[i] == 1 ) { sprintf ( p ,"# Value for <%s> tag taken from " "default.conf.\n",m->m_xml ); p += gbstrlen ( p ); continue; } */ // debug point //if ( m->m_type == TYPE_SAFEBUF ) // log("hey"); // loop over all in this potential array for ( j = 0 ; j < count ; j++ ) { // the xml //if ( p + gbstrlen(m->m_xml) >= pend ) goto hadError; if ( g_errno ) goto hadError; //sprintf ( p , "<%s>" , m->m_xml ); //p += gbstrlen ( p ); sb.safePrintf("<%s>" , m->m_xml ); // print CDATA if string if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_SAFEBUF || m->m_type == TYPE_STRINGNONEMPTY ) { //sprintf ( p , "m_xml , "filterRulesetDefault")==0) // log("got it"); // . represent it in ascii form // . this escapes out <'s and >'s // . this ALSO encodes #'s (xml comment indicators) //p = getParmHtmlEncoded(p,pend,m,s); getParmHtmlEncoded(&sb,m,s); // print CDATA if string if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_SAFEBUF || m->m_type == TYPE_STRINGNONEMPTY ) { //sprintf ( p , "]]>" ); //p += gbstrlen ( p ); sb.safeStrcpy("]]>" ); } // this is NULL if it ran out of room //if ( ! p ) goto hadError; if ( g_errno ) goto hadError; // advance to next element in array, if it is one s = s + m->m_size; // close the xml tag //if ( p + 4 >= pend ) goto hadError; //sprintf ( p , "\n" ); //p += gbstrlen ( p ); sb.safeStrcpy("\n" ); if ( g_errno ) goto hadError; } } //*p = '\0'; sb.nullTerm(); //ff.set ( f ); //if ( ! ff.open ( O_RDWR | O_CREAT | O_TRUNC ) ) // return log("db: Could not open %s : %s", // ff.getFilename(),mstrerror(g_errno)); // save the parm to the file //len = gbstrlen(buf); len = sb.length(); // use -1 for offset so we do not use pwrite() so it will not leave // garbage at end of file //n = ff.write ( buf , len , -1 ); //n = ff.write ( sb.getBufStart() , len , -1 ); //ff.close(); //if ( n == len ) return true; // save to filename "f". returns # of bytes written. -1 on error. if ( sb.safeSave ( f ) >= 0 ) return true; return log("admin: Could not write to file %s.",f); hadError: return log("admin: Error writing to %s: %s",f,mstrerror(g_errno)); //File bigger than %"INT32" bytes." // " Please increase #define in Parms.cpp.", // (int32_t)MAX_CONF_SIZE); } Parm *Parms::getParm ( char *cgi ) { for ( int32_t i = 0 ; i < m_numParms ; i++ ) { if ( ! 
m_parms[i].m_cgi ) continue ; if ( m_parms[i].m_cgi[0] != cgi[0] ) continue; if ( m_parms[i].m_cgi[1] != cgi[1] ) continue; if ( strcmp ( m_parms[i].m_cgi , cgi ) == 0 ) return &m_parms[i]; } return NULL; } /* Parm *Parms::getParm2 ( char *cgi , int32_t cgiLen ) { for ( int32_t i = 0 ; i < m_numParms ; i++ ) { if ( ! m_parms[i].m_cgi ) continue ; if ( m_parms[i].m_cgi[0] != cgi[0] ) continue; if ( cgiLen >=2 && m_parms[i].m_cgi[1] != cgi[1] ) continue; // only compare as many letters as the cgi name has if ( strncmp ( m_parms[i].m_cgi , cgi , cgiLen ) ) continue; // that means we gotta check lengths next if ( gbstrlen(m_parms[i].m_cgi) != cgiLen ) continue; // got a match return &m_parms[i]; } return NULL; } */ /* #define PHTABLE_SIZE (MAX_PARMS*2) Parm *Parms::getParm ( char *cgi ) { // make the hash table for the first call static int32_t s_phtable [ PHTABLE_SIZE ]; static Parm *s_phparm [ PHTABLE_SIZE ]; static bool s_init = false; // do not re-make the table if we already did if ( s_init ) goto skipMakeTable; // ok, now make the table s_init = true; memset ( s_phparm , 0 , sizeof(s_phparm) ); for ( int32_t i = 0 ; i < m_numParms ; i++ ) { if ( ! m_parms[i].m_cgi ) continue ; int32_t h = hash32 ( m_parms[i].m_cgi ); int32_t n = h % PHTABLE_SIZE; while ( s_phparm[n] ) { // . sanity check // . we don't have that many parms, they should never // collide!!... but it is possible i guess. if ( s_phtable[n] == h ) { log(LOG_LOGIC,"Parms: collisions forbidden in " "getParm(). Duplicate cgi name?"); char *xx = NULL; *xx = 0; } if (++n >= PHTABLE_SIZE) n = 0; } s_phtable[n] = h; // fill the bucket s_phparm [n] = &m_parms[i]; // the parm } skipMakeTable: // look up in table int32_t h = hash32 ( cgi ); int32_t n = h % PHTABLE_SIZE; // while bucket is occupied and does not equal our hash... chain while ( s_phparm[n] && s_phtable[n] != h ) if (++n >= PHTABLE_SIZE) n = 0; // if empty, no match return s_phparm[n]; } */ bool Parms::getParmHtmlEncoded ( SafeBuf *sb , Parm *m , char *s ) { // do not breach the buffer //if ( p + 100 >= pend ) return p; // print it out char t = m->m_type; if ( t == TYPE_CHAR || t == TYPE_BOOL || t == TYPE_CHECKBOX || t == TYPE_PRIORITY || t == TYPE_PRIORITY2 || //t == TYPE_DIFFBOT_DROPDOWN || t == TYPE_UFP || t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES || t == TYPE_FILTER || t == TYPE_BOOL2 || t == TYPE_CHAR2 ) sb->safePrintf("%"INT32"",(int32_t)*s); else if ( t == TYPE_FLOAT ) sb->safePrintf("%f",*(float *)s); else if ( t == TYPE_IP ) sb->safePrintf("%s",iptoa(*(int32_t *)s)); else if ( t == TYPE_LONG || t == TYPE_LONG_CONST || t == TYPE_RULESET|| t == TYPE_SITERULE ) sb->safePrintf("%"INT32"",*(int32_t *)s); else if ( t == TYPE_LONG_LONG ) sb->safePrintf("%"INT64"",*(int64_t *)s); else if ( t == TYPE_SAFEBUF ) { SafeBuf *sb2 = (SafeBuf *)s; char *buf = sb2->getBufStart(); //int32_t blen = 0; //if ( buf ) blen = gbstrlen(buf); //p = htmlEncode ( p , pend , buf , buf + blen , true ); // #?* // we can't do proper cdata and be backwards compatible //sb->cdataEncode ( buf );//, blen );//, true ); // #?* if ( buf ) sb->htmlEncode ( buf ); } else if ( t == TYPE_STRING || t == TYPE_STRINGBOX || t == TYPE_STRINGNONEMPTY || t == TYPE_TIME) { //int32_t slen = gbstrlen ( s ); // this returns the length of what was written, it may // not have converted everything if pend-p was too small... 
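// (added note; illustrative example, not from the original source)
// the htmlEncode() call below escapes markup-significant bytes so
// setFromFile() can never mistake a stored value for a tag or a
// comment when the conf file is re-read. a hypothetical string parm
// value of
//   robots & <b>spiders</b> # test
// would be written into the conf file roughly as
//   robots &amp; &lt;b&gt;spiders&lt;/b&gt; &#035; test
// (the '#' is escaped too, since an unencoded pound sign starts a
// comment per the stripping loop in setXmlFromFile() above).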
//p += saftenTags2 ( p , pend - p , s , len ); //p = htmlEncode ( p , pend , s , s + slen , true /*#?*/); // we can't do proper cdata and be backwards compatible //sb->cdataEncode ( s );//, slen );//, true /*#?*/); sb->htmlEncode ( s ); } else if ( t == TYPE_DATE || t == TYPE_DATE2 ) { // time is stored as int32_t int32_t ct = *(int32_t *)s; // get the time struct struct tm *tp = localtime ( (time_t *)&ct ) ; // set the "selected" month for the drop down char tmp[100]; strftime ( tmp , 100 , "%d %b %Y %H:%M UTC" , tp ); sb->safeStrcpy ( tmp ); sb->setLabel("parm3"); } //p += gbstrlen ( p ); //return p; return true; } /* // returns the size needed to serialize parms int32_t Parms::getStoredSize() { int32_t size = 0; // calling serialize with no ptr gets size serialize( NULL, &size ); return size; } // . serialize parms to buffer // . accepts addr of buffer ptr and addr of buffer size // . on entry buf can be NULL to determine required size // . if buf is not NULL, *bufSize must specify the size of buf // . on exit *buf is filled with serialized parms // . on exit *bufSize is set to the actual len of *buf bool Parms::serialize( char *buf, int32_t *bufSize ) { g_errno = 0; if ( ! bufSize ) { g_errno = EBADENGINEER; log( "admin: serialize: bad engineer: no bufSize ptr" ); *bufSize = 0; return false; } bool sizeChk = false; char *end = NULL; if ( ! buf ) sizeChk = true; // just calc size else end = buf + *bufSize; // for overrun checking // serialize OBJ_CONF and OBJ_COLL parms *bufSize = 0; char *p = buf; // now the parms struct SerParm *sp = NULL; for ( int32_t i = 0 ; i < m_numParms ; i++ ) { Parm *m = &m_parms[i]; // ignore these: if ( m->m_obj == OBJ_SI ) continue; if ( m->m_off < 0 ) continue; if ( m->m_type == TYPE_COMMENT ) continue; if ( m->m_type == TYPE_MONOD2 ) continue; if ( m->m_type == TYPE_MONOM2 ) continue; if ( m->m_type == TYPE_CMD ) continue; if ( m->m_type == TYPE_LONG_CONST ) continue; if ( ! m->m_sync ) continue; // parm is not to be synced // determine the size of the parm value int32_t size = 0; if ( m->m_type == TYPE_CHAR ) size = 1; if ( m->m_type == TYPE_CHAR2 ) size = 1; if ( m->m_type == TYPE_CHECKBOX ) size = 1; if ( m->m_type == TYPE_BOOL ) size = 1; if ( m->m_type == TYPE_BOOL2 ) size = 1; if ( m->m_type == TYPE_PRIORITY ) size = 1; if ( m->m_type == TYPE_PRIORITY2 ) size = 1; //if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1; if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1; if ( m->m_type == TYPE_RETRIES ) size = 1; if ( m->m_type == TYPE_TIME ) size = 6; if ( m->m_type == TYPE_DATE2 ) size = 4; if ( m->m_type == TYPE_DATE ) size = 4; if ( m->m_type == TYPE_FLOAT ) size = 4; if ( m->m_type == TYPE_IP ) size = 4; if ( m->m_type == TYPE_RULESET ) size = 4; if ( m->m_type == TYPE_LONG ) size = 4; if ( m->m_type == TYPE_LONG_LONG ) size = 8; if ( m->m_type == TYPE_STRING ) size = m->m_size; if ( m->m_type == TYPE_STRINGBOX ) size = m->m_size; if ( m->m_type == TYPE_STRINGNONEMPTY ) size = m->m_size; if ( m->m_type == TYPE_SAFEBUF ) size = m->m_size; if ( m->m_type == TYPE_SITERULE ) size = 4; // . set size to the total size of array // . 
set cnt to the number of itmes int32_t cnt = 1; if (m->m_fixed > 0) { size *= m->m_fixed; cnt = m->m_fixed; } else { size *= m->m_max; cnt = m->m_max; } if ( m->m_obj == OBJ_CONF ) { bool overflew = serializeConfParm( m, i, &p, end, size, cnt, sizeChk, bufSize ); if ( overflew ) goto overflow; } else if ( m->m_obj == OBJ_COLL ) { collnum_t j = g_collectiondb.getFirstCollnum (); while ( j >= 0 ) { CollectionRec *cr = g_collectiondb.getRec( j ); bool overflew = serializeCollParm( cr, m, i, &p, end, size, cnt, sizeChk, bufSize ); if ( overflew ) goto overflow; j = g_collectiondb.getNextCollnum ( j ); } } } if ( ! sizeChk ) { // set the final marker to 0s to indicate the end sp = (struct SerParm *)p; sp->i = 0; sp->obj = 0; sp->size = 0; sp->cnt = 0; } *bufSize += sizeof( struct SerParm ); return true; overflow: g_errno = EBADENGINEER; log(LOG_WARN, "admin: serialize: bad engineer: overflow" ); *bufSize = 0; return false; } // . serialize a conf parm // . if sizeChk is true then we do not serialize, but just get the // bytes required if we did serialize // . serialize parm into *p, the cursor i guess, buf end is "end" bool Parms::serializeConfParm( Parm *m, int32_t i, char **p, char *end, int32_t size, int32_t cnt, bool sizeChk, int32_t *bufSz ) { SerParm *sp = NULL; // safebuf not supported here yet, but it for coll recs below // so copy code from there if you need it if ( m->m_type == TYPE_SAFEBUF ) { char *xx=NULL;*xx=0;} if (m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_STRINGNONEMPTY ) { char *sVal = NULL; if ( ! sizeChk ) { sp = (SerParm *)*p; sp->i = i; // index of parm sp->obj = OBJ_CONF; sp->size = 0L; // 0 for strings sp->cnt = cnt; // # of strings // if an array, get num of member if ( cnt > 1 ) { sp->off = m->m_off - sizeof(int32_t); sp->num = *(int32_t *)((char *)&g_conf + sp->off); } else { sp->off = 0; sp->num = 0; } sVal = sp->val; } char *sConf = (char *)&g_conf + m->m_off; int32_t totLen = 0; int32_t tcnt = cnt; while ( tcnt ) { int32_t len = gbstrlen( sConf ); if ( ! sizeChk ) { // copy the parm value if ( sVal + len > end ) return true; // overflow strcpy( sVal, sConf ); } totLen += len + 1; // incl the NULL // inc conf ptr by size of strings sConf += m->m_size; // inc ser value by len of str + NULL sVal += len + 1; tcnt--; } if ( ! sizeChk ) { // inc by tot len of compacted strings *p += sizeof( *sp ) + totLen; } *bufSz += sizeof( SerParm ) + totLen; } else { if ( ! sizeChk ) { sp = (SerParm *)*p; sp->i = i; sp->obj = OBJ_CONF; sp->size = size; // tot size if array sp->cnt = cnt; // num of items // if array, get num of member if ( cnt > 1 ) { sp->off = m->m_off - sizeof(int32_t); sp->num = *(int32_t *)((char *)&g_conf + sp->off); } else { sp->off = 0; sp->num = 0; } // copy the parm's whole value if ( sp->val + size > end ) return true; // overflow gbmemcpy( sp->val, (char *)&g_conf + m->m_off, size ); // inc by tot size if array *p += sizeof( *sp ) + size; } *bufSz += sizeof( SerParm ) + size; } return false; } // . serialize a coll parm in CollectionRec.h // . if sizeChk is true then we do not serialize, but just get the // bytes required if we did serialize // . 
// . serialize a coll parm in CollectionRec.h
// . if sizeChk is true then we do not serialize, but just get the
//   bytes required if we did serialize
// . serialize parm into *p, the cursor i guess, buf end is "end"
bool Parms::serializeCollParm( CollectionRec *cr, Parm *m, int32_t i,
                               char **p, char *end,
                               int32_t size, int32_t cnt, bool sizeChk,
                               int32_t *bufSize) {
    SerParm *sp = NULL;
    if (m->m_type == TYPE_STRING ||
        m->m_type == TYPE_STRINGBOX ||
        m->m_type == TYPE_SAFEBUF ||
        m->m_type == TYPE_STRINGNONEMPTY ) {
        char *sVal = NULL;
        if ( ! sizeChk ) {
            sp = (SerParm *)*p;
            sp->i = i; // index of parm
            sp->obj = OBJ_COLL;
            sp->size = 0L; // 0 for strings
            sp->cnt = cnt; // # of strings
            // is this parm an array of parms?
            if ( cnt > 1 ) {
                // the offset of the "count" or the
                // "number of elements" in the array.
                // it precedes the value of the first element,
                // as can be seen for parms in
                // CollectionRec.h.
                sp->off = m->m_off - sizeof(int32_t);
                // store the # of them into "num"
                sp->num = *(int32_t *)((char *)cr + sp->off);
            }
            else { sp->off = 0; sp->num = 0; }
            sVal = sp->val;
        }
        // point to the actual parm itself
        char *sColl = (char *)cr + m->m_off;
        int32_t totLen = 0;
        // "cnt" is how many elements in the array
        int32_t tcnt = cnt;
        while ( tcnt ) {
            // the length of the string
            int32_t len;
            // the string
            char *pstr;
            // if a safebuf, point to string it has
            if ( m->m_type == TYPE_SAFEBUF ) {
                SafeBuf *sx = (SafeBuf *)sColl;
                pstr = sx->getBuf();
                len = sx->length();
                if ( ! pstr ) pstr = "";
            }
            // get length of the string. if not a safebuf it will
            // just be an outright string in CollectionRec.h
            else {
                pstr = sColl;
                len = gbstrlen( sColl );
            }
            if ( ! sizeChk ) {
                // copy the string
                if ( sVal+len > end ) {
                    log("parms: buffer too small");
                    return true;
                }
                // this puts a \0 at the end
                strcpy( sVal, pstr );
            }
            totLen += len + 1; // incl NULL
            // . inc cr ptr by size of strs
            // . this is the size of the SafeBuf for TYPE_SAFEBUF
            sColl += m->m_size;
            // . inc the write cursor by string length + the \0
            sVal += len + 1;
            tcnt--;
        }
        if ( ! sizeChk ) {
            // inc by tot len of cmpctd str
            *p += sizeof( *sp ) + totLen;
        }
        *bufSize += sizeof( SerParm ) + totLen;
    }
    else {
        if ( ! sizeChk ) {
            sp = (SerParm *)*p;
            sp->i = i;
            sp->obj = OBJ_COLL;
            sp->size = size; // tot size
            sp->cnt = cnt; // num of items
            // get num of member
            if ( cnt > 1 ) {
                sp->off = m->m_off - sizeof(int32_t);
                sp->num = *(int32_t *)((char *)cr + sp->off);
            }
            else { sp->off = 0; sp->num = 0; }
            // copy whole value
            if ( sp->val + size > end ) return true;
            gbmemcpy( sp->val, (char *)cr + m->m_off, size );
            // inc by whole size of value
            *p += sizeof( *sp ) + size;
        }
        *bufSize += sizeof( SerParm ) + size;
    }
    return false;
}

// deserialize parms from buffer and set our values to the new values
void Parms::deserialize( char *buf ) {
    g_errno = 0;
    char *p = buf;
    bool confChgd = false;
    SerParm *sp = (SerParm *)p;
    int32_t numLooped = 0;
    const int32_t MAX_LOOP = (int32_t)(MAX_PARMS*1.5);
    // if one of these is non-zero, we're still working. note that
    // string parms legitimately have sp->size set to 0, so we only
    // stop on the all-zero sentinel header (or after too many loops).
    while ( (sp->obj || sp->size || sp->cnt) &&
            numLooped < MAX_LOOP ) {
        // grab the parm we're working on
        if ( sp->i < 0 || sp->i >= m_numParms ) {
            log( "admin: invalid parm # in Parms::deserialize" );
            char *xx = NULL; *xx = 0;
        }
        Parm *m = &m_parms[ sp->i ];
        if ( sp->obj == OBJ_CONF ) {
            deserializeConfParm( m, sp, &p, &confChgd );
            sp = (struct SerParm *)p;
        }
        else if ( sp->obj == OBJ_COLL ) {
            collnum_t j = g_collectiondb.getFirstCollnum ();
            //if(j <= 0) {
            //    log("coll: Collectiondb does not have a rec" );
            //    return;
            //}
            while ( j >= 0 ) {
                CollectionRec *cr = g_collectiondb.getRec( j );
                deserializeCollParm( cr, m, sp, &p );
                sp = (SerParm *)p;
                j = g_collectiondb.getNextCollnum ( j );
            }
        }
        // setup the next rec
        sp = (SerParm *)p;
        numLooped++;
    }
    if (numLooped >= MAX_LOOP) {
        log( "admin: infinite loop in Parms::deserialize(). halting!");
        char *xx = NULL; *xx = 0;
    }
    // if we changed the conf, we need to save it
    if ( confChgd ) { g_conf.save (); }
    // if we changed a CollectionRec, we need to save it
    int32_t j = g_collectiondb.getFirstCollnum ();
    while ( j >= 0 ) {
        CollectionRec *cr = g_collectiondb.getRec( j );
        if ( cr->m_needsSave ) {
            cr->save ();
            // so g_spiderCache can reload if sameDomainWait, etc.
            // have changed
            g_collectiondb.updateTime();
        }
        j = g_collectiondb.getNextCollnum ( j );
    }
}
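// A minimal sketch of the cursor advance that deserializeConfParm()
// and deserializeCollParm() below perform inline (illustrative only;
// "payload" here just names the computed byte count):
//
//   int32_t payload = sp->size;              // fixed-size parm
//   if ( sp->size == 0 ) {                   // string parm
//           char *v = sp->val;
//           for ( int32_t k = 0 ; k < sp->cnt ; k++ )
//                   v += gbstrlen ( v ) + 1; // skip string + NUL
//           payload = v - sp->val;
//   }
//   *p += sizeof(SerParm) + payload;         // next record header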
void Parms::deserializeConfParm( Parm *m, SerParm *sp, char **p,
                                 bool *confChgd ) {
    if ( m->m_off + sp->size > (int32_t)sizeof(g_conf) ||
         m->m_off + sp->size < 0 ){
        log(LOG_WARN, "admin: deserializing parm would overflow "
            "the conf!");
        char *xx =0; *xx = 0;
    }
    if ( sp->size == 0 ) {
        // string
        char *sVal = sp->val;
        char *sConf = (char *)&g_conf + m->m_off;
        int32_t totLen = 0;
        bool goodParm = true;
        int32_t tcnt = sp->cnt;
        while ( tcnt ) {
            goodParm = (goodParm && 0 == strcmp( sVal, sConf ));
            int32_t len = gbstrlen( sVal );
            totLen += len + 1;
            // inc ser value by len of str + NULL
            sVal += len + 1;
            // inc conf ptr by size of strings
            sConf += m->m_size;
            tcnt--;
        }
        if ( goodParm ) {
            // . inc by sizeof rec and tot len of compacted array
            *p += sizeof( *sp ) + totLen;
            return;
        }
        // parms don't match
        sVal = sp->val;
        sConf = (char *)&g_conf + m->m_off;
        totLen = 0;
        tcnt = sp->cnt;
        while ( tcnt ) {
            // copy an array value to this parm
            strcpy( sConf, sVal );
            int32_t len = gbstrlen( sVal );
            totLen += len + 1; // incl the NULL
            // inc conf ptr by size of strings
            sConf += m->m_size;
            // inc ser value by len of str + NULL
            sVal += len + 1;
            tcnt--;
        }
        // set num of member
        if ( sp->off ) {
            int32_t *tmp = (int32_t *)((char *)&g_conf + sp->off);
            *tmp = sp->num;
        }
        // log the changed parm
        log( LOG_INFO, "admin: Parm "
             "#%"INT32" \"%s\" (\"%s\") in conf "
             "changed on sync.", sp->i, m->m_cgi, m->m_title );
        *confChgd = true;
        // inc by sizeof rec and tot len of compacted array
        *p += sizeof( *sp ) + totLen;
    }
    else {
        bool goodParm = ( 0 == memcmp( sp->val,
                                       (char *)&g_conf + m->m_off,
                                       sp->size ) );
        if ( ! goodParm ) {
            // copy the new parm to m's loc
            gbmemcpy( (char *)&g_conf + m->m_off, sp->val, sp->size );
            // set num of member
            if ( sp->off ) {
                int32_t *tmp = (int32_t *)((char *)&g_conf + sp->off);
                *tmp = sp->num;
            }
            // log the changed parm
            log( LOG_INFO, "admin: Parm "
                 "#%"INT32" \"%s\" (\"%s\") in conf "
                 "changed on sync.", sp->i, m->m_cgi, m->m_title );
            *confChgd = true;
        }
        // increase by rec size and size of parm
        *p += sizeof( *sp ) + sp->size;
    }
}

void Parms::deserializeCollParm( CollectionRec *cr, Parm *m, SerParm *sp,
                                 char **p ) {
    if ( m->m_off + sp->size > (int32_t)sizeof(CollectionRec) ||
         m->m_off + sp->size < 0 ) {
        log(LOG_WARN, "admin: deserializing parm would overflow "
            "the collection rec!");
        char *xx =0; *xx = 0;
    }
    if ( sp->size == 0 ) {
        // strings
        char *sVal = sp->val; // the sent string buffer i guess
        char *sColl = (char *)cr + m->m_off; // what we have
        int32_t totLen = 0;
        int32_t tcnt = sp->cnt; // # of strings
        bool goodParm = true;
        while ( tcnt ) {
            char *pstr;
            if ( m->m_type == TYPE_SAFEBUF ) {
                SafeBuf *sx = (SafeBuf *)sColl;
                pstr = sx->getBuf();
            }
            else { pstr = sColl; }
            // set goodParm to true if unchanged
            goodParm= (goodParm && 0 == strcmp(sVal, pstr));
            // get length of what was sent to us
            int32_t len = gbstrlen( sVal );
            totLen += len + 1; //incl NULL
            // this is a list of strings with \0s (sent to us)
            sVal += len + 1; //incl NULL
            // inc by size of strs. point to next string we have
            // stored in our array of strings in CollectionRec.
            // for TYPE_SAFEBUF this size is sizeof(SafeBuf).
            sColl += m->m_size;
            tcnt--;
        }
        // if parm was an exact match return now
        if ( goodParm ) {
            // . inc by sizeof rec and tot len of compacted array
            // . skip the SerParm and following string buffer.
            *p += sizeof( *sp ) + totLen;
            return;
        }
        //
        // if parms don't match, we need to update our stuff
        //
        // point to the sent string buffer
        sVal = sp->val;
        // point to the local parm, array of strings or safebufs
        sColl = (char *)cr + m->m_off;
        totLen = 0;
        // how many strings or safebufs in there?
        tcnt = sp->cnt;
        // loop over each one
        while ( tcnt ) {
            if ( m->m_type == TYPE_SAFEBUF ) {
                SafeBuf *sx = (SafeBuf *)sColl;
                sx->set ( sVal );
                sx->nullTerm ( );
            }
            else {
                // copy an array value to this parm
                strcpy( sColl, sVal );
            }
            // get length of string we copied
            int32_t len = gbstrlen( sVal );
            totLen += len + 1; // +the NULL
            // . inc coll ptr by size of strings
            sColl += m->m_size;
            // .
inc ser value by len of str + NULL sVal += len + 1; tcnt--; } // we changed the record cr->m_needsSave = true; // set num of member if ( sp->off ) { int32_t *tmp = (int32_t *)((char *)cr + sp->off); *tmp = sp->num; } // log the changed parm log( LOG_INFO, "admin: Parm " "#%"INT32" \"%s\" (\"%s\") in " "collection \"%s\" " "changed on sync.", sp->i, m->m_cgi, m->m_title, cr->m_coll ); // . inc by sizeof rec and // tot len of compacted array *p += sizeof( *sp ) + totLen; } else { // sanity if ( m->m_type == TYPE_SAFEBUF ) { char *xx=NULL;*xx=0; } if ( 0 != memcmp( sp->val, (char *)cr + m->m_off, sp->size) ) { // copy the new value gbmemcpy( (char *)cr + m->m_off, sp->val, sp->size ); // set num of member if ( sp->off ) { int32_t *tmp = (int32_t *)((char *)cr + sp->off); *tmp = sp->num; } // log the changed parm log( LOG_INFO, "admin: Parm " "#%"INT32" \"%s\" (\"%s\") " "in collection \"%s\" " "changed on sync.", sp->i, m->m_cgi, m->m_title, cr->m_coll ); // we changed the record cr->m_needsSave = true; } // inc by rec size and tot len of array *p += sizeof( *sp ) + sp->size; } } */ void Parms::init ( ) { // initialize the Parms class if we need to, only do it once static bool s_init = false ; if ( s_init ) return; s_init = true ; // default all for ( int32_t i = 0 ; i < MAX_PARMS ; i++ ) { m_parms[i].m_parmNum= i; m_parms[i].m_hash = 0 ; m_parms[i].m_title = "" ; // for detecting if not set m_parms[i].m_desc = "" ; // for detecting if not set m_parms[i].m_cgi = NULL ; // for detecting if not set m_parms[i].m_off = -1 ; // for detecting if not set // for PAGE_FILTERS url filters for printing the url // filter profile parm above the url filters table rows. m_parms[i].m_colspan= -1; m_parms[i].m_def = NULL ; // for detecting if not set m_parms[i].m_defOff = -1; // if default pts to collrec parm m_parms[i].m_type = TYPE_NONE ; // for detecting if not set m_parms[i].m_page = -1 ; // for detecting if not set m_parms[i].m_obj = -1 ; // for detecting if not set m_parms[i].m_max = 1 ; // max elements in array m_parms[i].m_fixed = 0 ; // size of fixed size array m_parms[i].m_size = 0 ; // max string size m_parms[i].m_cast = 1 ; // send to all hosts? m_parms[i].m_rowid = -1 ; // rowid of -1 means not in row m_parms[i].m_addin = 0 ; // add insert row command? m_parms[i].m_rdonly = 0 ; // is command off in read-only mode? m_parms[i].m_hdrs = 1 ; // assume to always print headers m_parms[i].m_perms = 0 ; // same as containing WebPages perms m_parms[i].m_plen = -1 ; // offset for strings length m_parms[i].m_group = 1 ; // start of a new group of controls? m_parms[i].m_priv = 0 ; // is it private? m_parms[i].m_save = 1 ; // save to xml file? m_parms[i].m_min = -1 ; // min value (for int32_t parms) // search fields //m_parms[i].m_sparm = 0; //m_parms[i].m_scmd = NULL;//"/search"; //m_parms[i].m_scgi = NULL;// defaults to m_cgi m_parms[i].m_flags = 0; m_parms[i].m_icon = NULL; m_parms[i].m_class = NULL; m_parms[i].m_qterm = NULL; m_parms[i].m_subMenu= 0; m_parms[i].m_spriv = 0; // m_sdefo = -1; // just use m_off for this! 
m_parms[i].m_sminc = -1; // min in collection rec m_parms[i].m_smaxc = -1; // max in collection rec m_parms[i].m_smin = 0x80000000; // 0xffffffff; m_parms[i].m_smax = 0x7fffffff; //m_parms[i].m_soff = -1; // offset into SearchInput m_parms[i].m_sprpg = 1; // propagate to other pages via GET m_parms[i].m_sprpp = 1; // propagate to other pages via POST m_parms[i].m_sync = true; } // inherit perms from page //for ( int32_t i = 1 ; i < MAX_PARMS ; i++ ) // if ( m_parms[i].m_page ) // m_parms[i].m_perms = m_parms[i-1].m_perms; Parm *m = &m_parms [ 0 ]; CollectionRec cr; SearchInput si; /////////////////////////////////////////// // CAN ONLY BE CHANGED IN CONF AT STARTUP (no cgi field) /////////////////////////////////////////// char *g = (char *)&g_conf; char *x = (char *)&cr; char *y = (char *)&si; ////////////// // // now for Pages.cpp printApiForPage() we need these // ////////////// GigablastRequest gr; InjectionRequest ir; /* m->m_title = "delete collection"; m->m_desc = "A collection name to delete. You can specify multiple " "&delColl= parms in the request to delete multiple " "collections."; m->m_cgi = "delColl"; m->m_page = PAGE_DELCOLL; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = 0;//PF_API | PF_REQUIRED; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "delete collection"; m->m_desc = "A collection name to delete. You can specify multiple " "&delColl= parms in the request to delete multiple " "collections."; // camelcase as opposed to above lowercase m->m_cgi = "delcoll"; m->m_page = PAGE_DELCOLL; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = PF_API | PF_REQUIRED; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "add collection"; m->m_desc = "A collection name to add."; // camelcase support m->m_cgi = "addColl"; m->m_page = PAGE_ADDCOLL; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = PF_API | PF_REQUIRED; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "add collection"; m->m_desc = "A collection name to add."; // lowercase support m->m_cgi = "addcoll"; m->m_page = PAGE_ADDCOLL; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = PF_HIDDEN; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; */ m->m_title = "collection"; m->m_desc = "Clone settings INTO this collection."; m->m_cgi = "c"; m->m_page = PAGE_CLONECOLL; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = PF_API | PF_REQUIRED; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "collection"; m->m_desc = "Use this collection."; m->m_cgi = "c"; m->m_page = PAGE_BASIC_STATUS; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = PF_API | PF_REQUIRED; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "collection"; m->m_desc = "Use this collection."; m->m_cgi = "c"; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; // do not show in html controls m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "collection"; m->m_desc = "Use this collection."; m->m_cgi = "c"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; // do not show in html controls m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = 
"collection"; m->m_desc = "Use this collection."; m->m_cgi = "c"; m->m_page = PAGE_SPIDERDB; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; // do not show in html controls m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "collection"; m->m_desc = "Use this collection."; m->m_cgi = "c"; m->m_page = PAGE_SITEDB; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; // do not show in html controls m->m_flags = PF_API | PF_REQUIRED | PF_NOHTML; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "collection"; m->m_desc = "Inject into this collection."; m->m_cgi = "c"; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR; m->m_def = NULL; // PF_COLLDEFAULT: so it gets set to default coll on html page m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML; m->m_page = PAGE_INJECT; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; // // // // more global-ish parms // // // m->m_title = "show settings"; // m->m_desc = "show settings or values for this page."; // m->m_cgi = "showsettings"; // m->m_page = PAGE_MASTER; // m->m_obj = OBJ_NONE; // m->m_type = TYPE_BOOL; // m->m_def = "1"; // // do not show in html controls // m->m_flags = PF_API | PF_NOHTML; // m->m_off = (char *)&gr.m_coll - (char *)&gr; // m++; //////////// // // end stuff for printApiForPage() // //////////// // just a comment in the conf file m->m_desc = "All <, >, \" and # characters that are values for a field " "contained herein must be represented as " "<, >, " and # respectively."; m->m_type = TYPE_COMMENT; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; // if the next guy has no description (m_desc) he is assumed to // share the description of the previous parm with one. /* m->m_title = "main external ip"; m->m_desc = "This is the IP and port that a user connects to in " "order to search this Gigablast network. This should be the " "same for all gb processes."; m->m_off = (char *)&g_conf.m_mainExternalIp - g; m->m_def = "127.0.0.1"; // if no default, it is required! m->m_type = TYPE_IP; m++; m->m_title = "main external port"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_mainExternalPort - g; m->m_def = "80"; m->m_type = TYPE_LONG; m++; */ /* m->m_title = "indexdb split"; m->m_desc = "Number of times to split indexdb across groups. " "Must be a power of 2."; m->m_off = (char *)&g_hostdb.m_indexSplits - g; // -1 means to do a full split just based on docid, just like titledb m->m_def = "-1"; // "1"; m->m_type = TYPE_LONG; m++; m->m_title = "full indexdb split"; m->m_desc = "Set to 1 (true) if indexdb is fully split. Performance " "is much better for fully split indexes."; m->m_off = (char *)&g_conf.m_fullSplit - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m++; m->m_title = "legacy indexdb split"; m->m_desc = "Set to 1 (true) if using legacy indexdb splitting. For " "data generated with farmington release."; m->m_off = (char *)&g_conf.m_legacyIndexdbSplit - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m++; m->m_title = "tfndb extension bits"; m->m_desc = "Number of extension bits to use in Tfndb. Increased for " "large indexes."; m->m_off = (char *)&g_conf.m_tfndbExtBits - g; m->m_def = "7"; m->m_type = TYPE_LONG; m++; */ /* m->m_title = "checksumdb key size"; m->m_desc = "This determines the key size for checksums. 
" "Must be set for every host."; //m->m_cgi = ""; m->m_off = (char *)&g_conf.m_checksumdbKeySize - g; m->m_type = TYPE_LONG; m->m_def = "12"; m++; */ // just a comment in the conf file m->m_desc = "Below the various Gigablast databases are configured.\n" "<*dbMaxTreeMem> - mem used for holding new recs\n" "<*dbMaxDiskPageCacheMem> - disk page cache mem for this db\n" "<*dbMaxCacheMem> - cache mem for holding single recs\n" //"<*dbMinFilesToMerge> - required # files to trigger merge\n" "<*dbSaveCache> - save the rec cache on exit?\n" "<*dbMaxCacheAge> - max age (seconds) for recs in rec cache\n" "See that Stats page for record counts and stats.\n"; m->m_type = TYPE_COMMENT; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; m->m_title = "dns max cache mem"; m->m_desc = "How many bytes should be used for caching DNS replies?"; m->m_off = (char *)&g_conf.m_dnsMaxCacheMem - g; m->m_def = "128000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; // g_dnsDistributed always saves now. main.cpp inits it that way. //m->m_title = "dns save cache"; //m->m_desc = "Should the DNS reply cache be saved/loaded on " // "exit/startup?"; //m->m_off = (char *)&g_conf.m_dnsSaveCache - g; //m->m_def = "0"; //m->m_type = TYPE_BOOL; //m++; m->m_title = "tagdb max tree mem"; m->m_desc = "A tagdb record " "assigns a url or site to a ruleset. Each tagdb record is " "about 100 bytes or so."; m->m_off = (char *)&g_conf.m_tagdbMaxTreeMem - g; m->m_def = "1028000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; // m->m_title = "tagdb max page cache mem"; // m->m_desc = ""; // m->m_off = (char *)&g_conf.m_tagdbMaxDiskPageCacheMem - g; // m->m_def = "200000"; // m->m_type = TYPE_LONG; // m->m_flags = PF_NOSYNC|PF_NOAPI; // m->m_page = PAGE_NONE; // m->m_obj = OBJ_CONF; // m++; //m->m_title = "tagdb max cache mem"; //m->m_desc = ""; //m->m_off = (char *)&g_conf.m_tagdbMaxCacheMem - g; //m->m_def = "128000"; //m->m_type = TYPE_LONG; //m++; //m->m_title = "tagdb min files to merge"; //m->m_desc = ""; //m->m_off = (char *)&g_conf.m_tagdbMinFilesToMerge - g; //m->m_def = "2"; //m->m_type = TYPE_LONG; //m->m_save = 0; //m++; m->m_title = "catdb max tree mem"; m->m_desc = "A catdb record " "assigns a url or site to DMOZ categories. 
Each catdb record " "is about 100 bytes."; m->m_off = (char *)&g_conf.m_catdbMaxTreeMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; // m->m_title = "catdb max page cache mem"; // m->m_desc = ""; // m->m_off = (char *)&g_conf.m_catdbMaxDiskPageCacheMem - g; // m->m_def = "25000000"; // m->m_type = TYPE_LONG; // m->m_flags = PF_NOSYNC|PF_NOAPI; // m->m_page = PAGE_NONE; // m->m_obj = OBJ_CONF; // m++; m->m_title = "catdb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_catdbMaxCacheMem - g; m->m_def = "0"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; /* m->m_title = "catdb min files to merge"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_catdbMinFilesToMerge - g; m->m_def = "2"; m->m_type = TYPE_LONG; m->m_save = 0; m++; m->m_title = "revdb max tree mem"; m->m_desc = "Revdb holds the meta list we added for this doc."; m->m_off = (char *)&g_conf.m_revdbMaxTreeMem - g; m->m_def = "30000000"; m->m_type = TYPE_LONG; m++; */ /* m->m_title = "timedb max tree mem"; m->m_desc = "Timedb holds event time intervals"; m->m_off = (char *)&g_conf.m_timedbMaxTreeMem - g; m->m_def = "30000000"; m->m_type = TYPE_LONG; m++; */ /* m->m_title = "titledb max tree mem"; m->m_desc = "Titledb holds the compressed documents that have been " "indexed."; m->m_off = (char *)&g_conf.m_titledbMaxTreeMem - g; m->m_def = "10000000"; m->m_type = TYPE_LONG; m++; m->m_title = "titledb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_titledbMaxCacheMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m++; m->m_title = "titledb max cache age"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_titledbMaxCacheAge - g; m->m_def = "86400"; // 1 day m->m_type = TYPE_LONG; m++; m->m_title = "titledb save cache"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_titledbSaveCache - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m++; */ m->m_title = "clusterdb max tree mem"; m->m_desc = "Clusterdb caches small records for site clustering " "and deduping."; m->m_off = (char *)&g_conf.m_clusterdbMaxTreeMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; /* m->m_title = "clusterdb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_clusterdbMaxCacheMem - g; m->m_def = "100000000"; m->m_type = TYPE_LONG; m++; m->m_title = "clusterdb max page cache mem"; m->m_desc = ""; m->m_off =(char *)&g_conf.m_clusterdbMaxDiskPageCacheMem - g; m->m_def = "100000000"; m->m_type = TYPE_LONG; m++; */ // this is overridden by collection m->m_title = "clusterdb min files to merge"; m->m_desc = ""; m->m_cgi = "cmftm"; m->m_off = (char *)&g_conf.m_clusterdbMinFilesToMerge - g; //m->m_def = "2"; m->m_def = "-1"; // -1 means to use collection rec m->m_type = TYPE_LONG; m->m_save = 0; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m->m_flags = PF_NOAPI; m++; m->m_title = "clusterdb save cache"; m->m_desc = ""; m->m_cgi = "cdbsc"; m->m_off = (char *)&g_conf.m_clusterdbSaveCache - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m->m_flags = PF_NOAPI; m++; m->m_title = "max vector cache mem"; m->m_desc = "Max memory for dup vector cache."; m->m_off = (char *)&g_conf.m_maxVectorCacheMem - g; m->m_def = "10000000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; /* m->m_title = "checksumdb max tree mem"; m->m_desc = "Checksumdb is 
used for deduping same-site urls at " "index time.";
m->m_off = (char *)&g_conf.m_checksumdbMaxTreeMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m++;

m->m_title = "checksumdb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_checksumdbMaxCacheMem - g; m->m_def = "2000000"; m->m_type = TYPE_LONG; m++;

m->m_title = "checksumdb max page cache mem"; m->m_desc = ""; m->m_off =(char *)&g_conf.m_checksumdbMaxDiskPageCacheMem-g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m++;

// this is overridden by collection
m->m_title = "checksumdb min files to merge"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_checksumdbMinFilesToMerge- g;
//m->m_def = "2";
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG; m->m_save = 0; m++;
*/

/*
m->m_title = "tfndb max tree mem";
m->m_desc = "Tfndb holds small records for each url in Spiderdb or " "Titledb.";
m->m_off = (char *)&g_conf.m_tfndbMaxTreeMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m++;

m->m_title = "tfndb max page cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_tfndbMaxDiskPageCacheMem - g; m->m_def = "5000000"; m->m_type = TYPE_LONG; m++;
*/

/*
// this is overridden by collection
m->m_title = "tfndb min files to merge"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_tfndbMinFilesToMerge - g; m->m_def = "2"; m->m_type = TYPE_LONG; m->m_save = 0; m++;
*/

/*
m->m_title = "spiderdb max tree mem";
m->m_desc = "Spiderdb holds urls to be spidered.";
m->m_off = (char *)&g_conf.m_spiderdbMaxTreeMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m++;

m->m_title = "spiderdb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_spiderdbMaxCacheMem - g; m->m_def = "0"; m->m_type = TYPE_LONG; m++;

m->m_title = "spiderdb max page cache mem"; m->m_desc = ""; m->m_off =(char *)&g_conf.m_spiderdbMaxDiskPageCacheMem-g; m->m_def = "500000"; m->m_type = TYPE_LONG; m++;

// this is overridden by collection
m->m_title = "spiderdb min files to merge"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_spiderdbMinFilesToMerge - g;
//m->m_def = "2";
m->m_def = "-1"; // -1 means to use collection rec
m->m_type = TYPE_LONG; m->m_save = 0; m++;
*/

m->m_title = "robotdb max cache mem";
m->m_desc = "Robotdb caches robots.txt files.";
m->m_off = (char *)&g_conf.m_robotdbMaxCacheMem - g; m->m_def = "128000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++;

m->m_title = "robotdb save cache"; m->m_cgi = "rdbsc"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_robotdbSaveCache - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m->m_flags = PF_NOAPI; m++;

/*
m->m_title = "indexdb max tree mem";
m->m_desc = "Indexdb holds the terms extracted from spidered " "documents.";
m->m_off = (char *)&g_conf.m_indexdbMaxTreeMem - g; m->m_def = "10000000"; m->m_type = TYPE_LONG; m++;

m->m_title = "indexdb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_indexdbMaxCacheMem - g; m->m_def = "5000000"; m->m_type = TYPE_LONG; m++;

m->m_title = "indexdb max page cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_indexdbMaxDiskPageCacheMem - g; m->m_def = "50000000"; m->m_type = TYPE_LONG; m++;
*/

// m->m_title = "linkdb max page cache mem";
// m->m_desc = "";
// m->m_off = (char *)&g_conf.m_linkdbMaxDiskPageCacheMem - g;
// m->m_def = "0";
// m->m_type = TYPE_LONG;
// m->m_flags = PF_NOSYNC|PF_NOAPI;
// m->m_page = PAGE_NONE;
// m->m_obj = OBJ_CONF;
// m++;

/*
// this is overridden by collection
m->m_title = "indexdb min files to merge";
m->m_desc = ""; m->m_off = (char *)&g_conf.m_indexdbMinFilesToMerge - g; //m->m_def = "6"; m->m_def = "-1"; // -1 means to use collection rec m->m_type = TYPE_LONG; m->m_save = 0; m++; m->m_title = "indexdb max index list age"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_indexdbMaxIndexListAge - g; m->m_def = "60"; m->m_type = TYPE_LONG; m++; //m->m_title = "indexdb truncation limit"; //m->m_desc = ""; //m->m_off = (char *)&g_conf.m_indexdbTruncationLimit - g; //m->m_def = "50000000"; //m->m_type = TYPE_LONG; //m++; m->m_title = "indexdb save cache"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_indexdbSaveCache - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m++; */ /* m->m_title = "datedb max tree mem"; m->m_desc = "Datedb holds the terms extracted from spidered " "documents."; m->m_off = (char *)&g_conf.m_datedbMaxTreeMem - g; m->m_def = "10000000"; m->m_type = TYPE_LONG; m++; m->m_title = "datedb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_datedbMaxCacheMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m++; // this is overridden by collection m->m_title = "datedb min files to merge"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_datedbMinFilesToMerge - g; //m->m_def = "8"; m->m_def = "-1"; // -1 means to use collection rec m->m_type = TYPE_LONG; m->m_save = 0; m++; m->m_title = "datedb max index list age"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_datedbMaxIndexListAge - g; m->m_def = "60"; m->m_type = TYPE_LONG; m++; m->m_title = "datedb save cache"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_datedbSaveCache - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m++; */ /* m->m_title = "linkdb max tree mem"; m->m_desc = "Linkdb stores linking information"; m->m_off = (char *)&g_conf.m_linkdbMaxTreeMem - g; m->m_def = "20000000"; m->m_type = TYPE_LONG; m++; // this is overridden by collection m->m_title = "linkdb min files to merge"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_linkdbMinFilesToMerge - g; m->m_def = "-1"; // -1 means to use collection rec m->m_type = TYPE_LONG; //m->m_save = 0; m++; */ /* m->m_title = "quota table max mem"; m->m_desc = "For caching and keeping tabs on exact quotas per " "domain without having to do a disk seek. 
If you are using " "exact quotas and see a lot of disk seeks on Indexdb, try " "increasing this."; m->m_off = (char *)&g_conf.m_quotaTableMaxMem - g; m->m_def = "1000000"; m->m_type = TYPE_LONG; m++; */ m->m_title = "statsdb max tree mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_statsdbMaxTreeMem - g; m->m_def = "5000000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; m->m_title = "statsdb max cache mem"; m->m_desc = ""; m->m_off = (char *)&g_conf.m_statsdbMaxCacheMem - g; m->m_def = "0"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; // m->m_title = "statsdb max disk page cache mem"; // m->m_desc = ""; // m->m_off = (char *)&g_conf.m_statsdbMaxDiskPageCacheMem - g; // m->m_def = "1000000"; // m->m_type = TYPE_LONG; // m->m_flags = PF_NOSYNC|PF_NOAPI; // m->m_page = PAGE_NONE; // m->m_obj = OBJ_CONF; // m++; //m->m_title = "statsdb min files to merge"; //m->m_desc = ""; //m->m_off = (char *)&g_conf.m_statsdbMinFilesToMerge - g; //m->m_def = "5"; //m->m_type = TYPE_LONG; //m++; /* m->m_title = "use buckets for in memory recs"; m->m_desc = "Use buckets for in memory recs for indexdb, datedb, " "and linkdb."; m->m_off = (char *)&g_conf.m_useBuckets - g; m->m_def = "1"; m->m_type = TYPE_BOOL; m++; */ m->m_title = "http max send buf size"; m->m_desc = "Maximum bytes of a doc that can be sent before having " "to read more from disk"; m->m_cgi = "hmsbs"; m->m_off = (char *)&g_conf.m_httpMaxSendBufSize - g; m->m_def = "128000"; m->m_type = TYPE_LONG; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m->m_flags = PF_NOAPI; m++; m->m_title = "search results max cache mem"; m->m_desc = "Bytes to use for caching search result pages."; m->m_off = (char *)&g_conf.m_searchResultsMaxCacheMem - g; m->m_def = "100000"; m->m_type = TYPE_LONG; m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; //m->m_title = "search results max cache age"; //m->m_desc = "Maximum age to cache search results page in seconds."; //m->m_off = (char *)&g_conf.m_searchResultsMaxCacheAge - g; //m->m_def = "86400"; //m->m_type = TYPE_LONG; //m++; //m->m_title = "search results save cache"; //m->m_desc = "Should the search results cache be saved to disk?"; //m->m_off = (char *)&g_conf.m_searchResultsSaveCache - g; //m->m_def = "0"; //m->m_type = TYPE_BOOL; //m++; //m->m_title = "site link info max cache mem"; //m->m_desc = "Bytes to use for site link info data."; //m->m_off = (char *)&g_conf.m_siteLinkInfoMaxCacheMem - g; //m->m_def = "100000"; //m->m_type = TYPE_LONG; //m++; //m->m_title = "site link info max cache age"; //m->m_desc = "Maximum age to cache site link info data in seconds."; //m->m_off = (char *)&g_conf.m_siteLinkInfoMaxCacheAge - g; //m->m_def = "3600"; //m->m_type = TYPE_LONG; //m++; //m->m_title = "site link info save cache"; //m->m_desc = "Should the site link info cache be saved to disk?"; //m->m_off = (char *)&g_conf.m_siteLinkInfoSaveCache - g; //m->m_def = "0"; //m->m_type = TYPE_BOOL; //m++; //m->m_title = "site quality max cache mem"; //m->m_desc = "Bytes to use for site or root page quality."; //m->m_off = (char *)&g_conf.m_siteQualityMaxCacheMem - g; //m->m_def = "2000000"; // 2MB //m->m_type = TYPE_LONG; //m++; //m->m_title = "site quality save cache"; //m->m_desc = "Should the site link info cache be saved to disk?"; //m->m_off = (char *)&g_conf.m_siteQualitySaveCache - g; //m->m_def = "0"; //m->m_type = TYPE_BOOL; //m++; //m->m_title = "max incoming links to 
sample"; //m->m_desc = "Max linkers to a doc that are sampled to determine " // "quality and for gathering link text."; //m->m_off = (char *)&g_conf.m_maxIncomingLinksToSample - g; //m->m_def = "100"; //m->m_type = TYPE_LONG; //m++; //m->m_title = "allow async signals"; //m->m_desc = "Allow software interrupts?"; //m->m_off = (char *)&g_conf.m_allowAsyncSignals - g; //m->m_def = "1"; //m->m_type = TYPE_BOOL; //m++; /* m->m_title = "qa build mode"; m->m_desc = "When on Msg13.cpp saves docs in the qatest123 coll " "to qa/ subdir, when off " "if downloading a doc for qatest123 coll and not in " "qa subdir then it returns a 404."; m->m_cgi = "qabuildmode"; m->m_off = (char *)&g_conf.m_qaBuildMode - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m->m_flags = PF_NOAPI | PF_HIDDEN; m++; */ m->m_title = "read only mode"; m->m_desc = "Read only mode does not allow spidering."; m->m_cgi = "readonlymode"; m->m_off = (char *)&g_conf.m_readOnlyMode - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m->m_flags = PF_NOAPI; m++; /* Disable this until it works. m->m_title = "use merge token"; m->m_desc = "Restrict merging to one host per token group? Hosts " "that use the same disk and mirror hosts are generally in the " "same token group so that only one host in the group can be " "doing a merge at a time. This prevents query response time " "from suffering too much."; m->m_off = (char *)&g_conf.m_useMergeToken - g; m->m_def = "1"; m->m_type = TYPE_BOOL; m++; */ /* m->m_title = "do spell checking"; m->m_desc = "Spell check using the dictionary. Will be available " "again soon."; m->m_off = (char *)&g_conf.m_doSpellChecking - g; m->m_cgi = "dospellchecking"; m->m_def = "1"; m->m_type = TYPE_BOOL; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_CONF; m++; */ m->m_title = "do narrow search"; m->m_desc = "give narrow search suggestions."; m->m_off = (char *)&g_conf.m_doNarrowSearch - g; m->m_cgi = "donarrowsearch"; m->m_def = "0"; m->m_type = TYPE_BOOL; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_CONF; m++; /////////////////////////////////////////// // BASIC SETTINGS /////////////////////////////////////////// m->m_title = "spidering enabled"; m->m_desc = "Pause and resumes spidering for this collection."; m->m_cgi = "bcse"; m->m_off = (char *)&cr.m_spideringEnabled - x; m->m_page = PAGE_BASIC_SETTINGS; m->m_obj = OBJ_COLL; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_DUP|PF_CLONE; m++; m->m_title = "site list"; m->m_xml = "siteList"; m->m_desc = "List of sites to spider, one per line. " "See example site list below. " "
" "
" "Example #1: mysite.com myothersite.com" "
" "This will spider just those two sites." "
" "
" "Example #2: seed:dmoz.org" "
" "This will spider the whole web starting with the website " "dmoz.org" "

" "Gigablast uses the " "insitelist " "directive on " "the url filters " "page to make sure that the spider only indexes urls " "that match the site patterns you specify here, other than " "urls you add individually via the add urls or inject url " "tools. " "Limit list to 300MB. If you have a lot of INDIVIDUAL urls " "to add then consider using the add " "urls interface."; m->m_cgi = "sitelist"; m->m_off = (char *)&cr.m_siteListBuf - x; m->m_page = PAGE_BASIC_SETTINGS; m->m_obj = OBJ_COLL; m->m_type = TYPE_SAFEBUF; m->m_func = CommandUpdateSiteList; m->m_def = ""; // rebuild urlfilters now will nuke doledb and call updateSiteList() m->m_flags = PF_TEXTAREA | PF_DUP | PF_REBUILDURLFILTERS; m++; /* m->m_title = "spider sites"; m->m_desc = "Attempt to spider and index urls in the " "\"site patterns\" above. Saves you from having to add " "the same list of sites on the " "add url page."; m->m_cgi = "spiderToo"; m->m_off = (char *)&cr.m_spiderToo - x; m->m_page = PAGE_BASIC_SETTINGS; m->m_obj = OBJ_COLL; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_flags = PF_NOSAVE | PF_DUP; m++; */ /* // the new upload post submit button m->m_title = "upload site list"; m->m_desc = "Upload your file of site patterns. Completely replaces " "the site list in the text box above."; m->m_cgi = "uploadsitelist"; m->m_page = PAGE_BASIC_SETTINGS; m->m_obj = OBJ_COLL; m->m_off = 0; m->m_def = NULL; m->m_type = TYPE_FILEUPLOADBUTTON; m->m_flags = PF_NOSAVE | PF_DUP; m++; */ m->m_title = "restart collection"; m->m_desc = "Remove all documents from the collection and re-add " "seed urls from site list."; // If you do this accidentally there " //"is a recovery procedure to " // "get back the trashed data."; m->m_cgi = "restart"; m->m_page = PAGE_BASIC_SETTINGS; m->m_obj = OBJ_COLL; m->m_type = TYPE_CMD; m->m_func2 = CommandRestartColl; m++; /////////////////////////////////////////// // SITE LIST /////////////////////////////////////////// /* m->m_title = "spider sites"; m->m_desc = "Attempt to spider and index urls in the " "\"site patterns\" above. Saves you from having to add " "the same list of sites on the " "add url page."; m->m_cgi = "spiderToo"; m->m_off = (char *)&cr.m_spiderToo - x; m->m_page = PAGE_SITES; m->m_obj = OBJ_COLL; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_flags = PF_NOSAVE ; m++; */ /////////////////////////////////////////// // SYNC CONTROLS /////////////////////////////////////////// /* m->m_title = "sync enabled"; m->m_desc = "Turn data synchronization on or off. When a host comes " "up he will perform an incremental synchronization with a " "twin if he detects that he was unable to save his data " "when he last exited."; m->m_cgi = "sye"; m->m_off = (char *)&g_conf.m_syncEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SYNC; m++; m->m_title = "dry run"; m->m_desc = "Should Gigablast just run through and log the changes " "it would make without actually making them?"; m->m_cgi = "sdr"; m->m_off = (char *)&g_conf.m_syncDryRun - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "sync indexdb"; m->m_desc = "Turn data synchronization on or off for indexdb. 
" "Indexdb holds the index information."; m->m_cgi = "si"; m->m_off = (char *)&g_conf.m_syncIndexdb - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m++; m->m_title = "sync logging"; m->m_desc = "Log fixes?"; m->m_cgi = "slf"; m->m_off = (char *)&g_conf.m_syncLogging - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "union titledb and spiderdb"; m->m_desc = "If a host being sync'd has a title record (cached web " "page) that the " "remote host does not, normally, it would be deleted. " "But if this is true then it is kept. " "Useful for reducing title rec not found errors."; m->m_cgi = "sdu"; m->m_off = (char *)&g_conf.m_syncDoUnion - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "force out of sync"; m->m_desc = "Forces this host to be out of sync."; m->m_cgi = "foos"; m->m_type = TYPE_CMD; m->m_func = CommandForceOutOfSync; m->m_cast = 0; m++; m->m_title = "bytes per second"; m->m_desc = "How many bytes to read per second for syncing. " "Decrease to reduce impact of syncing on query " "response time."; m->m_cgi = "sbps"; m->m_off = (char *)&g_conf.m_syncBytesPerSecond - g; m->m_type = TYPE_LONG; m->m_def = "10000000"; m->m_units = "bytes"; m++; */ ///////////////////// // // DIFFBOT CRAWLBOT PARMS // ////////////////////// /////////// // // DO NOT INSERT parms above here, unless you set // m_obj = OBJ_COLL !!! otherwise it thinks it belongs to // OBJ_CONF as used in the above parms. // /////////// m->m_cgi = "dbtoken"; m->m_xml = "diffbotToken"; m->m_off = (char *)&cr.m_diffbotToken - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_DIFFBOT; m++; m->m_cgi = "createdtime"; m->m_xml = "collectionCreatedTime"; m->m_desc = "Time when this collection was created, or time of " "the last reset or restart."; m->m_off = (char *)&cr.m_diffbotCrawlStartTime - x; m->m_type = TYPE_LONG; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = "0"; m->m_flags = PF_NOAPI;//PF_DIFFBOT; no i want to saveToXml m++; m->m_cgi = "spiderendtime"; m->m_xml = "crawlEndTime"; m->m_desc = "If spider is done, when did it finish."; m->m_off = (char *)&cr.m_diffbotCrawlEndTime - x; m->m_type = TYPE_LONG; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = "0"; m->m_flags = PF_NOAPI;//PF_DIFFBOT; no i want to saveToXml m++; m->m_cgi = "dbcrawlname"; m->m_xml = "diffbotCrawlName"; m->m_off = (char *)&cr.m_diffbotCrawlName - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_DIFFBOT; m++; m->m_cgi = "notifyEmail"; m->m_title = "notify email"; m->m_xml = "notifyEmail"; m->m_off = (char *)&cr.m_notifyEmail - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_DIFFBOT; m++; m->m_cgi = "notifyWebhook"; m->m_xml = "notifyWebhook"; m->m_title = "notify webhook"; m->m_off = (char *)&cr.m_notifyUrl - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_DIFFBOT; m++; // collective respider frequency (for pagecrawlbot.cpp) m->m_title = "collective respider frequency (days)"; m->m_cgi = "repeat"; m->m_xml = "collectiveRespiderFrequency"; m->m_off = (char *)&cr.m_collectiveRespiderFrequency - x; m->m_type = TYPE_FLOAT; m->m_def = "0.0"; // 0.0 m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_units = "days"; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_title = "collective crawl delay (seconds)"; m->m_cgi = "crawlDelay"; m->m_xml = "collectiveCrawlDelay"; m->m_off = (char 
*)&cr.m_collectiveCrawlDelay - x; m->m_type = TYPE_FLOAT; m->m_def = ".250"; // 250 ms m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m->m_units = "seconds"; m++; m->m_cgi = "urlCrawlPattern"; m->m_xml = "diffbotUrlCrawlPattern"; m->m_title = "url crawl pattern"; m->m_off = (char *)&cr.m_diffbotUrlCrawlPattern - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_cgi = "urlProcessPattern"; m->m_xml = "diffbotUrlProcessPattern"; m->m_title = "url process pattern"; m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_cgi = "pageProcessPattern"; m->m_xml = "diffbotPageProcessPattern"; m->m_title = "page process pattern"; m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_cgi = "urlCrawlRegEx"; m->m_xml = "diffbotUrlCrawlRegEx"; m->m_title = "url crawl regex"; m->m_off = (char *)&cr.m_diffbotUrlCrawlRegEx - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_cgi = "urlProcessRegEx"; m->m_xml = "diffbotUrlProcessRegEx"; m->m_title = "url process regex"; m->m_off = (char *)&cr.m_diffbotUrlProcessRegEx - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_cgi = "maxHops"; m->m_xml = "diffbotHopcount"; m->m_title = "diffbot max hopcount"; m->m_off = (char *)&cr.m_diffbotMaxHops - x; m->m_type = TYPE_LONG; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = "-1"; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_cgi = "onlyProcessIfNew"; m->m_xml = "diffbotOnlyProcessIfNew"; m->m_title = "onlyProcessIfNew"; m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = "1"; m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; m->m_cgi = "seeds"; m->m_xml = "diffbotSeeds"; m->m_off = (char *)&cr.m_diffbotSeeds - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_flags = PF_DIFFBOT; m->m_def = ""; m++; m->m_xml = "isCustomCrawl"; m->m_off = (char *)&cr.m_isCustomCrawl - x; m->m_type = TYPE_CHAR; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_cgi = "isCustomCrawl"; m->m_def = "0"; m->m_flags = PF_DIFFBOT; m++; m->m_cgi = "maxToCrawl"; m->m_title = "max to crawl"; m->m_xml = "maxToCrawl"; m->m_off = (char *)&cr.m_maxToCrawl - x; m->m_type = TYPE_LONG_LONG; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = "100000"; m->m_flags = PF_DIFFBOT; m++; m->m_cgi = "maxToProcess"; m->m_title = "max to process"; m->m_xml = "maxToProcess"; m->m_off = (char *)&cr.m_maxToProcess - x; m->m_type = TYPE_LONG_LONG; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = "-1"; m->m_flags = PF_DIFFBOT; m++; m->m_cgi = "maxRounds"; m->m_title = "max crawl rounds"; m->m_xml = "maxCrawlRounds"; m->m_off = (char *)&cr.m_maxCrawlRounds - x; m->m_type = TYPE_LONG; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_def = "-1"; m->m_flags = PF_DIFFBOT; m++; ///////////////////// // // new cmd parms // ///////////////////// m->m_title = "insert parm row"; m->m_desc = "insert a row into a parm"; m->m_cgi = 
"insert"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func = CommandInsertUrlFiltersRow; m->m_cast = 1; m->m_flags = PF_REBUILDURLFILTERS; m++; m->m_title = "remove parm row"; m->m_desc = "remove a row from a parm"; m->m_cgi = "remove"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func = CommandRemoveUrlFiltersRow; m->m_cast = 1; m->m_flags = PF_REBUILDURLFILTERS; m++; m->m_title = "delete collection"; m->m_desc = "delete a collection"; m->m_cgi = "delete"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func2 = CommandDeleteColl; m->m_cast = 1; m++; m->m_title = "delete collection 2"; m->m_desc = "delete the specified collection"; m->m_cgi = "delColl"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func2 = CommandDeleteColl2; m->m_cast = 1; m++; m->m_title = "delete collection"; m->m_desc = "Delete the specified collection. You can specify " "multiple &delcoll= parms in a single request to delete " "multiple collections at once."; // lowercase as opposed to camelcase above m->m_cgi = "delcoll"; m->m_type = TYPE_CMD; m->m_page = PAGE_DELCOLL; m->m_obj = OBJ_COLL; m->m_func2 = CommandDeleteColl2; m->m_cast = 1; m->m_flags = PF_API | PF_REQUIRED; m++; // arg is the collection # to clone from m->m_title = "clone collection"; m->m_desc = "Clone collection settings FROM this collection."; m->m_cgi = "clonecoll"; m->m_type = TYPE_CMD; m->m_page = PAGE_CLONECOLL; m->m_obj = OBJ_COLL; m->m_func = CommandCloneColl; m->m_cast = 1; m->m_flags = PF_API | PF_REQUIRED; m++; m->m_title = "add collection"; m->m_desc = "add a new collection"; // camelcase support m->m_cgi = "addColl"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func = CommandAddColl0; m->m_cast = 1; m++; m->m_title = "add collection"; m->m_desc = "Add a new collection with this name. No spaces " "allowed or strange characters allowed. 
Max of 64 characters."; // lower case support m->m_cgi = "addcoll"; m->m_type = TYPE_CMD; m->m_page = PAGE_ADDCOLL; m->m_obj = OBJ_COLL; m->m_func = CommandAddColl0; m->m_cast = 1; m->m_flags = PF_API | PF_REQUIRED; m++; // // CLOUD SEARCH ENGINE SUPPORT // // used to prevent a guest ip adding more than one coll m->m_title = "user ip"; m->m_desc = "IP of user adding collection."; m->m_cgi = "userip"; m->m_xml = "userIp"; m->m_off = (char *)&cr.m_userIp - x; m->m_type = TYPE_STRING; m->m_size = 16; m->m_def = ""; m->m_group = 0; m->m_flags = PF_HIDDEN;// | PF_NOSAVE; m->m_page = PAGE_ADDCOLL; m->m_obj = OBJ_COLL; m++; m->m_title = "add custom crawl"; m->m_desc = "add custom crawl"; m->m_cgi = "addCrawl"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func = CommandAddColl1; m->m_cast = 1; m++; m->m_title = "add bulk job"; m->m_desc = "add bulk job"; m->m_cgi = "addBulk"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func = CommandAddColl2; m->m_cast = 1; m++; m->m_title = "in sync"; m->m_desc = "signify in sync with host 0"; m->m_cgi = "insync"; m->m_type = TYPE_CMD; m->m_page = PAGE_NONE; m->m_obj = OBJ_COLL; m->m_func = CommandInSync; m->m_cast = 1; m++; /////////////////////////////////////////// // SEARCH CONTROLS /////////////////////////////////////////// //m->m_title = "allow RAID style list intersection"; //m->m_desc = "Allow using RAID style lookup for intersecting term " // "lists and getting docIds for queries."; //m->m_cgi = "uraid"; //m->m_off = (char *)&cr.m_allowRaidLookup - x; //m->m_type = TYPE_BOOL; //m->m_def = "0"; //m++; //m->m_title = "allow RAIDed term list read"; //m->m_desc = "Allow splitting up the term list read for large lists " // "amongst twins."; //m->m_cgi = "ulraid"; //m->m_off = (char *)&cr.m_allowRaidListRead - x; //m->m_type = TYPE_BOOL; //m->m_def = "0"; //m->m_group = 0; //m++; //m->m_title = "max RAID mercenaries"; //m->m_desc = "Max number of mercenaries to use in RAID lookup and " // "intersection."; //m->m_cgi = "raidm"; //m->m_off = (char *)&cr.m_maxRaidMercenaries - x; //m->m_type = TYPE_LONG; //m->m_def = "2"; //m->m_group = 0; //m++; //m->m_title = "min term list size to RAID"; //m->m_desc = "Term list size to begin doing term list RAID"; //m->m_cgi = "raidsz"; //m->m_off = (char *)&cr.m_minRaidListSize - x; //m->m_type = TYPE_LONG; //m->m_def = "1000000"; //m->m_group = 0; //m++; m->m_title = "restrict indexdb for queries"; m->m_desc = "If this is true Gigablast will only search the root " "index file for docIds. 
Saves on disk seeks, " "but may use older versions of indexed web pages."; m->m_cgi = "riq"; m->m_off = (char *)&cr.m_restrictIndexdbForQuery - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_def = "0"; //m->m_sparm = 1; //m->m_scgi = "ri"; //m->m_soff = (char *)&si.m_restrictIndexdbForQuery - y; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; m->m_title = "restrict indexdb for xml feed"; m->m_desc = "Like above, but specifically for XML feeds."; m->m_cgi = "rix"; m->m_off = (char *)&cr.m_restrictIndexdbForXML - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; //m->m_title = "restrict indexdb for queries in xml feed"; //m->m_desc = "Same as above, but just for the XML feed."; //m->m_cgi = "riqx"; //m->m_off = (char *)&cr.m_restrictIndexdbForQueryRaw - x; //m->m_type = TYPE_BOOL; //m->m_def = "1"; //m->m_group = 0; //m++; m->m_title = "read from cache by default"; m->m_desc = "Should we read search results from the cache? Set " "to false to fix dmoz bug."; m->m_cgi = "rcd"; m->m_off = (char *)&cr.m_rcache - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "fast results"; m->m_desc = "Use &fast=1 to obtain search results from the much " "faster Gigablast index, although the results are not " "searched as thoroughly."; m->m_obj = OBJ_SI; m->m_page = PAGE_RESULTS; m->m_off = (char *)&si.m_query - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_def = "0"; m->m_cgi = "fast"; //m->m_size = MAX_QUERY_LEN; m->m_flags = PF_COOKIE | PF_WIDGET_PARM | PF_API; m++; m->m_title = "query"; m->m_desc = "The query to perform. See help. " "See the query operators below for " "more info."; m->m_obj = OBJ_SI; m->m_page = PAGE_RESULTS; m->m_off = (char *)&si.m_query - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_cgi = "q"; //m->m_size = MAX_QUERY_LEN; m->m_flags = PF_REQUIRED | PF_COOKIE | PF_WIDGET_PARM | PF_API; m++; // m->m_title = "query2"; // m->m_desc = "The query on which to score inlinkers."; // m->m_obj = OBJ_SI; // m->m_page = PAGE_NONE; // m->m_off = (char *)&si.m_query2 - y; // m->m_type = TYPE_CHARPTR;//STRING; // m->m_cgi = "qq"; // m->m_size = MAX_QUERY_LEN; // m->m_sprpg = 0; // do not store query, needs to be last so related // m->m_sprpp = 0; // topics can append to it // m->m_flags = PF_HIDDEN | PF_NOSAVE; // m++; m->m_title = "collection"; m->m_desc = "Search this collection. Use multiple collection names " "separated by a whitespace to search multiple collections at " "once."; m->m_cgi = "c"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = PF_API | PF_REQUIRED; m->m_off = (char *)&si.m_coll - y; m++; m->m_title = "number of results per query"; m->m_desc = "The number of results returned per page."; // make it 25 not 50 since we only have like 26 balloons m->m_def = "10"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_off = (char *)&si.m_docsWanted - y; m->m_type = TYPE_LONG; m->m_cgi = "n"; m->m_flags = PF_WIDGET_PARM | PF_API; m->m_smin = 0; m++; m->m_title = "first result num"; m->m_desc = "Start displaying at search result #X. 
Starts at 0."; m->m_def = "0"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_off = (char *)&si.m_firstResultNum - y; m->m_type = TYPE_LONG; m->m_cgi = "s"; m->m_smin = 0; m->m_sprpg = 0; m->m_sprpp = 0; m->m_flags = PF_REDBOX; m++; m->m_title = "show errors"; m->m_desc = "Show errors from generating search result summaries " "rather than just hide the docid. Useful for debugging."; m->m_cgi = "showerrors"; m->m_off = (char *)&si.m_showErrors - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "site cluster"; m->m_desc = "Should search results be site clustered? This " "limits each site to appearing at most twice in the " "search results. Sites are subdomains for the most part, " "like abc.xyz.com."; m->m_cgi = "sc"; m->m_off = (char *)&si.m_doSiteClustering - y; m->m_defOff= (char *)&cr.m_siteClusterByDefault - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "hide all clustered results"; m->m_desc = "Only display at most one result per site."; m->m_cgi = "hacr"; m->m_off = (char *)&si.m_hideAllClustered - y; m->m_defOff= (char *)&cr.m_hideAllClustered - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API; m++; m->m_title = "dedup results"; m->m_desc = "Should duplicate search results be removed? This is " "based on a content hash of the entire document. " "So documents must be exactly the same for the most part."; m->m_cgi = "dr"; // dedupResultsByDefault"; m->m_off = (char *)&si.m_doDupContentRemoval - y; m->m_defOff= (char *)&cr.m_dedupResultsByDefault - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 1; m->m_cgi = "dr"; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "percent similar dedup summary"; m->m_desc = "If document summary (and title) are " "this percent similar " "to a document summary above it, then remove it from the " "search results. 100 means only to remove if exactly the " "same. 0 means no summary deduping. You must also supply " "dr=1 for this to work."; m->m_cgi = "pss"; m->m_off = (char *)&si.m_percentSimilarSummary - y; m->m_defOff= (char *)&cr.m_percentSimilarSummary - x; m->m_type = TYPE_LONG; m->m_group = 0; m->m_smin = 0; m->m_smax = 100; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "dedup URLs"; m->m_desc = "Should we dedup URLs with case insensitivity? This is " "mainly to correct duplicate wiki pages."; m->m_cgi = "ddu"; m->m_off = (char *)&si.m_dedupURL - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "do spell checking"; m->m_desc = "If enabled while using the XML feed, " "when Gigablast finds a spelling recommendation it will be " "included in the XML tag. Default is 0 if using an " "XML feed, 1 otherwise. Will be available again soon."; m->m_cgi = "spell"; m->m_off = (char *)&si.m_spellCheck - y; m->m_type = TYPE_BOOL; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_def = "1"; m->m_flags = PF_API; m++; m->m_title = "stream search results"; m->m_desc = "Stream search results back on socket as they arrive. " "Useful when thousands/millions of search results are " "requested. Required when doing such things otherwise " "Gigablast could run out of memory. 
Only supported for " "JSON and XML formats, not HTML."; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_off = (char *)&si.m_streamResults - y; m->m_type = TYPE_CHAR; m->m_def = "0"; m->m_cgi = "stream"; m->m_flags = PF_API; m->m_sprpg = 0; // propagate to next 10 m->m_sprpp = 0; m++; m->m_title = "seconds back"; m->m_desc = "Limit results to pages spidered this many seconds ago. " "Use 0 to disable."; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_off = (char *)&si.m_secsBack - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_cgi = "secsback"; m->m_flags = PF_API; m++; m->m_title = "sort by"; m->m_desc = "Use 0 to sort results by relevance, 1 to sort by " "spider date, newest first, and 2 to sort by " "spider date, oldest first."; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_off = (char *)&si.m_sortBy - y; m->m_type = TYPE_CHAR; m->m_def = "0"; // this means relevance m->m_cgi = "sortby"; m->m_flags = PF_API; m++; m->m_title = "filetype"; m->m_desc = "Restrict results to this filetype. Supported " "filetypes are pdf, doc, html, xml, json, xls."; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_off = (char *)&si.m_filetype - y; m->m_type = TYPE_CHARPTR; m->m_def = ""; m->m_cgi = "filetype"; m->m_flags = PF_API; m++; m->m_title = "get scoring info"; m->m_desc = "Get scoring information for each result so you " "can see how each result is scored. You must explicitly " "request this using &scores=1 for the XML feed because it " "is not included by default."; m->m_cgi = "scores"; m->m_off = (char *)&si.m_getDocIdScoringInfo - y; m->m_type = TYPE_BOOL; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_def = NULL; m->m_flags = PF_API; // get default from collectionrec item m->m_defOff= (char *)&cr.m_getDocIdScoringInfo - x; m++; m->m_title = "do query expansion"; m->m_desc = "If enabled, query expansion will expand your query " "to include the various forms and " "synonyms of the query terms."; m->m_off = (char *)&si.m_queryExpansion - y; m->m_defOff= (char *)&cr.m_queryExpansion - x; m->m_type = TYPE_BOOL; m->m_cgi = "qe"; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; // more general parameters m->m_title = "max search results"; m->m_desc = "What is the maximum total number " "of returned search results?"; m->m_cgi = "msr"; m->m_off = (char *)&cr.m_maxSearchResults - x; m->m_type = TYPE_LONG; m->m_def = "1000"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max search results per query"; m->m_desc = "What is the limit to the total number " "of returned search results per query?"; m->m_cgi = "msrpq"; m->m_off = (char *)&cr.m_maxSearchResultsPerQuery - x; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_flags = 0; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max search results for paying clients"; m->m_desc = "What is the limit to the total number " "of returned search results for paying clients?"; m->m_cgi = "msrfpc"; m->m_off = (char *)&cr.m_maxSearchResultsForClients - x; m->m_type = TYPE_LONG; m->m_def = "1000"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max search results per query for paying clients"; m->m_desc = "What is the limit to the total number " "of returned search results per query for paying clients? 
" "Auto ban must be enabled for this to work."; m->m_cgi = "msrpqfc"; m->m_off = (char *)&cr.m_maxSearchResultsPerQueryForClients - x; m->m_type = TYPE_LONG; m->m_def = "1000"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "user ip"; m->m_desc = "The ip address of the searcher. We can pass back " "for use in the autoban technology which bans abusive IPs."; m->m_obj = OBJ_SI; m->m_page = PAGE_RESULTS; m->m_off = (char *)&si.m_userIpStr - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_cgi = "uip"; m->m_flags = PF_COOKIE | PF_WIDGET_PARM | PF_API; m++; m->m_title = "use min ranking algo"; m->m_desc = "Should search results be ranked using this algo?"; //m->m_cgi = "uma"; //m->m_off = (char *)&cr.m_siteClusterByDefault - x; m->m_off = (char *)&si.m_useMinAlgo - y; m->m_type = TYPE_BOOL; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; // seems, good, default it on m->m_def = "1"; m->m_cgi = "uma"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; // limit to this # of the top term pairs from inlink text whose // score is accumulated m->m_title = "real max top"; m->m_desc = "Only score up to this many inlink text term pairs"; m->m_off = (char *)&si.m_realMaxTop - y; m->m_type = TYPE_LONG; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_def = "10"; m->m_cgi = "rmt"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; m->m_title = "use new ranking algo"; m->m_desc = "Should search results be ranked using this new algo?"; m->m_off = (char *)&si.m_useNewAlgo - y; m->m_type = TYPE_BOOL; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; // seems, good, default it on m->m_def = "1"; m->m_cgi = "una"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; m->m_title = "do max score algo"; m->m_desc = "Quickly eliminated docids using max score algo"; m->m_off = (char *)&si.m_doMaxScoreAlgo - y; m->m_type = TYPE_BOOL; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_def = "1"; m->m_cgi = "dmsa"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; m->m_title = "use fast intersection algo"; m->m_desc = "Should we try to speed up search results generation?"; m->m_off = (char *)&si.m_fastIntersection - y; m->m_type = TYPE_CHAR; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; // turn off until we debug m->m_def = "-1"; m->m_cgi = "fi"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; m->m_title = "max number of facets to return"; m->m_desc = "Max number of facets to return"; m->m_off = (char *)&si.m_maxFacets - y; m->m_type = TYPE_LONG; m->m_def = "50"; m->m_group = 1; m->m_cgi = "nf"; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; // m->m_title = "special query"; // m->m_desc = "List of docids to restrain results to."; // m->m_cgi = "sq"; // m->m_off = (char *)&si.m_sq - y; // m->m_type = TYPE_CHARPTR; // m->m_def = NULL; // m->m_group = 0; // m++; // m->m_title = "negative docids"; // m->m_desc = "List of docids to ignore."; // m->m_cgi = "nodocids"; // m->m_off = (char *)&si.m_noDocIds - y; // m->m_type = TYPE_CHARPTR; // m->m_def = NULL; // m->m_group = 0; // m++; // m->m_title = "negative siteids"; // m->m_desc = "Whitespace-separated list of 32-bit sitehashes " //"to ignore."; // m->m_cgi = "nositeids"; // m->m_off = (char *)&si.m_noSiteIds - y; // m->m_type = TYPE_CHARPTR; // m->m_def = NULL; // m->m_group = 0; // m++; m->m_title = "language weight"; m->m_desc = "Default language weight if document matches quer " "language. Use this to give results that match the specified " "the specified &qlang higher ranking, or docs whose language " "is unnknown. 
Can be overridden with " "&langw in the query url."; m->m_cgi = "langweight"; m->m_off = (char *)&cr.m_sameLangWeight - x; m->m_type = TYPE_FLOAT; m->m_def = "20.000000"; m->m_group = 1; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "use language weights"; m->m_desc = "Use language weights to sort query results. " "This will give results that match the specified &qlang " "higher ranking."; m->m_cgi = "lsort"; m->m_off = (char *)&cr.m_enableLanguageSorting - x; //m->m_soff = (char *)&si.m_enableLanguageSorting - y; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 1; //m->m_scgi = "lsort"; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "sort language preference"; m->m_desc = "Default language to use for ranking results. " //"This should only be used on limited collections. " "Value should be any language abbreviation, for example " "\"en\" for English. Use xx to give ranking " "boosts to no language in particular. See the language " "abbreviations at the bottom of the " "url filters page."; m->m_cgi = "qlang"; m->m_off = (char *)&si.m_defaultSortLang - y; m->m_type = TYPE_CHARPTR; //m->m_size = 6; // up to 5 chars + NULL, e.g. "en_US" m->m_def = "";//"xx";//_US"; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "language weight"; m->m_desc = "Use this to override the default language weight " "for this collection. The default language weight can be " "set in the search controls and is usually something like " "20.0, which means that we multiply a result's score by 20 " "if it is from the same language as the query or its language " "is unknown."; m->m_off = (char *)&si.m_sameLangWeight - y; m->m_defOff= (char *)&cr.m_sameLangWeight - x; m->m_type = TYPE_FLOAT; m->m_cgi = "langw"; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "sort country preference"; m->m_desc = "Default country to use for ranking results. " //"This should only be used on limited collections. " "Value should be any country code abbreviation, for example " "\"us\" for United States. This is currently not working."; m->m_cgi = "qcountry"; m->m_off = (char *)&si.m_defaultSortCountry - y; m->m_type = TYPE_CHARPTR; m->m_size = 2+1; m->m_def = "us"; m->m_group = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; /* m->m_title = "language method weights"; m->m_desc = "Language method weights for spider language " "detection. A string of ascii numerals that " "should default to 895768712"; m->m_cgi = "lmweights"; m->m_off = (char *)&cr.m_languageMethodWeights - x; m->m_type = TYPE_STRING; m->m_size = 10; // up to 9 chars + NULL m->m_def = "894767812"; m->m_group = 0; // m->m_sparm = 1; m++; m->m_title = "language detection sensitivity"; m->m_desc = "Language detection sensitivity. Higher" " values mean higher hitrate, but lower accuracy." " Suggested values are from 2 to 20"; m->m_cgi = "lmbailout"; m->m_off = (char *)&cr.m_languageBailout - x; m->m_type = TYPE_LONG; m->m_def = "5"; m->m_group = 0; // m->m_sparm = 1; m++; m->m_title = "language detection threshold"; m->m_desc = "Language detection threshold sensitivity." " Higher values mean better accuracy, but lower hitrate."
" Suggested values are from 2 to 20"; m->m_cgi = "lmthreshold"; m->m_off = (char *)&cr.m_languageThreshold - x; m->m_type = TYPE_LONG; m->m_def = "3"; m->m_group = 0; // m->m_sparm = 1; m++; m->m_title = "language detection samplesize"; m->m_desc = "Language detection size. Higher values" " mean more accuracy, but longer processing time." " Suggested values are 300-1000"; m->m_cgi = "lmsamples"; m->m_off = (char *)&cr.m_languageSamples - x; m->m_type = TYPE_LONG; m->m_def = "600"; m->m_group = 0; // m->m_sparm = 1; m++; m->m_title = "language detection spider samplesize"; m->m_desc = "Language detection page sample size. " "Higher values mean more accuracy, but longer " "spider time." " Suggested values are 3000-10000"; m->m_cgi = "lpsamples"; m->m_off = (char *)&cr.m_langPageLimit - x; m->m_type = TYPE_LONG; m->m_def = "6000"; m->m_group = 0; // m->m_sparm = 1; m++; */ m->m_title = "docs to check for post query"; m->m_desc = "How many search results should we " "scan for post query demotion? " "0 disables all post query reranking. "; m->m_cgi = "pqrds"; m->m_off = (char *)&si.m_docsToScanForReranking - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "demotion for foreign languages"; m->m_desc = "Demotion factor of non-relevant languages. Score " "will be penalized by this factor as a percent if " "it's language is foreign. " "A safe value is probably anywhere from 0.5 to 1. "; m->m_cgi = "pqrlang"; m->m_off = (char *)&cr.m_languageWeightFactor - x; //m->m_soff = (char *)&si.m_languageWeightFactor - y; m->m_type = TYPE_FLOAT; m->m_def = "0.999"; m->m_group = 0; //m->m_scgi = "pqrlang"; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for unknown languages"; m->m_desc = "Demotion factor for unknown languages. " "Page's score will be penalized by this factor as a percent " "if it's language is not known. " "A safe value is 0, as these pages will be reranked by " "country (see below). " "0 means no demotion."; m->m_cgi = "pqrlangunk"; m->m_off = (char *)&cr.m_languageUnknownWeight- x; //m->m_soff = (char *)&si.m_languageUnknownWeight- y; m->m_type = TYPE_FLOAT; m->m_def = "0.0"; m->m_group = 0; //m->m_scgi = "pqrlangunk"; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages where the country of the page writes " "in the same language as the country of the query"; m->m_desc = "Demotion for pages where the country of the page writes " "in the same language as the country of the query. " "If query language is the same as the language of the page, " "then if a language written in the country of the page matches " "a language written by the country of the query, then page's " "score will be demoted by this factor as a percent. " "A safe range is between 0.5 and 1. "; m->m_cgi = "pqrcntry"; m->m_off = (char *)&cr.m_pqr_demFactCountry - x; m->m_type = TYPE_FLOAT; m->m_def = "0.98"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for query terms or gigabits in url"; m->m_desc = "Demotion factor for query terms or gigabits " "in a result's url. 
" "Score will be penalized by this factor times the number " "of query terms or gigabits in the url divided by " "the max value below such that fewer " "query terms or gigabits in the url causes the result " "to be demoted more heavily, depending on the factor. " "Higher factors demote more per query term or gigabit " "in the page's url. " "Generally, a page may not be demoted more than this " "factor as a percent. Also, how it is demoted is " "dependent on the max value. For example, " "a factor of 0.2 will demote the page 20% if it has no " "query terms or gigabits in its url. And if the max value is " "10, then a page with 5 query terms or gigabits in its " "url will be demoted 10%; and 10 or more query terms or " "gigabits in the url will not be demoted at all. " "0 means no demotion. " "A safe range is from 0 to 0.35. "; m->m_cgi = "pqrqttiu"; m->m_off = (char *)&cr.m_pqr_demFactQTTopicsInUrl - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages with query terms or gigabits " "in url"; m->m_desc = "Max number of query terms or gigabits in a url. " "Pages with a number of query terms or gigabits in their " "urls greater than or equal to this value will not be " "demoted. " "This controls the range of values expected to represent " "the number of query terms or gigabits in a url. It should " "be set to or near the estimated max number of query terms " "or topics that can be in a url. Setting to a lower value " "increases the penalty per query term or gigabit that is " "not in a url, but decreases the range of values that " "will be demoted."; m->m_cgi = "pqrqttium"; m->m_off = (char *)&cr.m_pqr_maxValQTTopicsInUrl - x; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages that are not high quality"; m->m_desc = "Demotion factor for pages that are not high quality. " "Score is penalized by this number as a percent times level " "of quality. A pqge will be demoted by the formula " "(max quality - page's quality) * this factor / the max " "value given below. Generally, a page will not be " "demoted more than this factor as a percent. " "0 means no demotion. " "A safe range is between 0 to 1. "; m->m_cgi = "pqrqual"; m->m_off = (char *)&cr.m_pqr_demFactQual - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages that are not high quality"; m->m_desc = "Max page quality. Pages with a quality level " "equal to or higher than this value " "will not be demoted. "; m->m_cgi = "pqrqualm"; m->m_off = (char *)&cr.m_pqr_maxValQual - x; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages that are not " "root or have many paths in the url"; m->m_desc = "Demotion factor each path in the url. " "Score will be demoted by this factor as a percent " "multiplied by the number of paths in the url divided " "by the max value below. " "Generally, the page will not be demoted more than this " "value as a percent. " "0 means no demotion. " "A safe range is from 0 to 0.75. 
"; m->m_cgi = "pqrpaths"; m->m_off = (char *)&cr.m_pqr_demFactPaths - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages that have many paths in the url"; m->m_desc = "Max number of paths in a url. " "This should be set to a value representing a very high " "number of paths for a url. Lower values increase the " "difference between how much each additional path demotes. "; m->m_cgi = "pqrpathsm"; m->m_off = (char *)&cr.m_pqr_maxValPaths - x; m->m_type = TYPE_LONG; m->m_def = "16"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages that do not have a catid"; m->m_desc = "Demotion factor for pages that do not have a catid. " "Score will be penalized by this factor as a percent. " "A safe range is from 0 to 0.2. "; m->m_cgi = "pqrcatid"; m->m_off = (char *)&cr.m_pqr_demFactNoCatId - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages where smallest " "catid has a lot of super topics"; m->m_desc = "Demotion factor for pages where smallest " "catid has a lot of super topics. " "Page will be penalized by the number of super topics " "multiplied by this factor divided by the max value given " "below. " "Generally, the page will not be demoted more than this " "factor as a percent. " "Note: pages with no catid are demoted by this factor as " "a percent so as not to penalize pages with a catid. " "0 means no demotion. " "A safe range is between 0 and 0.25. "; m->m_cgi = "pqrsuper"; m->m_off = (char *)&cr.m_pqr_demFactCatidHasSupers - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages where smallest catid has a lot " "of super topics"; m->m_desc = "Max number of super topics. " "Pages whose smallest catid that has more super " "topics than this will be demoted by the maximum amount " "given by the factor above as a percent. " "This should be set to a value representing a very high " "number of super topics for a category id. " "Lower values increase the difference between how much each " "additional path demotes. "; m->m_cgi = "pqrsuperm"; m->m_off = (char *)&cr.m_pqr_maxValCatidHasSupers - x; m->m_type = TYPE_LONG; m->m_def = "11"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for larger pages"; m->m_desc = "Demotion factor for larger pages. " "Page will be penalized by its size times this factor " "divided by the max page size below. " "Generally, a page will not be demoted more than this " "factor as a percent. " "0 means no demotion. " "A safe range is between 0 and 0.25. "; m->m_cgi = "pqrpgsz"; m->m_off = (char *)&cr.m_pqr_demFactPageSize - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for larger pages"; m->m_desc = "Max page size. " "Pages with a size greater than or equal to this will be " "demoted by the max amount (the factor above as a percent). 
"; m->m_cgi = "pqrpgszm"; m->m_off = (char *)&cr.m_pqr_maxValPageSize - x; m->m_type = TYPE_LONG; m->m_def = "524288"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for non-location specific queries " "with a location specific title"; m->m_desc = "Demotion factor for non-location specific queries " "with a location specific title. " "Pages which contain a location in their title which is " "not in the query or the gigabits will be demoted by their " "population multiplied by this factor divided by the max " "place population specified below. " "Generally, a page will not be demoted more than this " "value as a percent. " "0 means no demotion. "; m->m_cgi = "pqrloct"; m->m_off = (char *)&cr.m_pqr_demFactLocTitle - x; //m->m_scgi = "pqrloct"; //m->m_soff = (char *)&si.m_pqr_demFactLocTitle - y; m->m_type = TYPE_FLOAT; m->m_def = "0.99"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for non-location specific queries " "with a location specific summary"; m->m_desc = "Demotion factor for non-location specific queries " "with a location specific summary. " "Pages which contain a location in their summary which is " "not in the query or the gigabits will be demoted by their " "population multiplied by this factor divided by the max " "place population specified below. " "Generally, a page will not be demoted more than this " "value as a percent. " "0 means no demotion. "; m->m_cgi = "pqrlocs"; m->m_off = (char *)&cr.m_pqr_demFactLocSummary - x; //m->m_scgi = "pqrlocs"; //m->m_soff = (char *)&si.m_pqr_demFactLocSummary - y; m->m_type = TYPE_FLOAT; m->m_def = "0.95"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for non-location specific queries " "with a location specific dmoz category"; m->m_desc = "Demotion factor for non-location specific queries " "with a location specific dmoz regional category. " "Pages which contain a location in their dmoz which is " "not in the query or the gigabits will be demoted by their " "population multiplied by this factor divided by the max " "place population specified below. " "Generally, a page will not be demoted more than this " "value as a percent. " "0 means no demotion. "; m->m_cgi = "pqrlocd"; m->m_off = (char *)&cr.m_pqr_demFactLocDmoz - x; //m->m_scgi = "pqrlocd"; //m->m_soff = (char *)&si.m_pqr_demFactLocDmoz - y; m->m_type = TYPE_FLOAT; m->m_def = "0.95"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demote locations that appear in gigabits"; m->m_desc = "Demote locations that appear in gigabits."; m->m_cgi = "pqrlocg"; m->m_off = (char *)&cr.m_pqr_demInTopics - x; //m->m_scgi = "pqrlocg"; //m->m_soff = (char *)&si.m_pqr_demInTopics - y; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for non-location specific queries " "with location specific results"; m->m_desc = "Max place population. " "Places with a population greater than or equal to this " "will be demoted to the maximum amount given by the " "factor above as a percent. 
"; m->m_cgi = "pqrlocm"; m->m_off = (char *)&cr.m_pqr_maxValLoc - x; m->m_type = TYPE_LONG; // charlottesville was getting missed when this was 1M m->m_def = "100000"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for non-html"; m->m_desc = "Demotion factor for content type that is non-html. " "Pages which do not have an html content type will be " "demoted by this factor as a percent. " "0 means no demotion. " "A safe range is between 0 and 0.35. "; m->m_cgi = "pqrhtml"; m->m_off = (char *)&cr.m_pqr_demFactNonHtml - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for xml"; m->m_desc = "Demotion factor for content type that is xml. " "Pages which have an xml content type will be " "demoted by this factor as a percent. " "0 means no demotion. " "Any value between 0 and 1 is safe if demotion for non-html " "is set to 0. Otherwise, 0 should probably be used. "; m->m_cgi = "pqrxml"; m->m_off = (char *)&cr.m_pqr_demFactXml - x; m->m_type = TYPE_FLOAT; m->m_def = "0.95"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages with other pages from same " "hostname"; m->m_desc = "Demotion factor for pages with fewer other pages from " "same hostname. " "Pages with results from the same host will be " "demoted by this factor times each fewer host than the max " "value given below, divided by the max value. " "Generally, a page will not be demoted more than this " "factor as a percent. " "0 means no demotion. " "A safe range is between 0 and 0.35. "; m->m_cgi = "pqrfsd"; m->m_off = (char *)&cr.m_pqr_demFactOthFromHost - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages with other pages from same " "domain"; m->m_desc = "Max number of pages from same domain. " "Pages which have this many or more pages from the same " "domain will not be demoted. "; m->m_cgi = "pqrfsdm"; m->m_off = (char *)&cr.m_pqr_maxValOthFromHost - x; m->m_type = TYPE_LONG; m->m_def = "12"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "initial demotion for pages with common " "topics in dmoz as other results"; m->m_desc = "Initial demotion factor for pages with common " "topics in dmoz as other results. " "Pages will be penalized by the number of common topics " "in dmoz times this factor divided by the max value " "given below. " "Generally, a page will not be demoted by more than this " "factor as a percent. " "Note: this factor is decayed by the factor specified in " "the parm below, decay for pages with common topics in " "dmoz as other results, as the number of pages with " "common topics in dmoz increases. " "0 means no demotion. " "A safe range is between 0 and 0.35. "; m->m_cgi = "pqrctid"; m->m_off = (char *)&cr.m_pqr_demFactComTopicInDmoz - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "decay for pages with common topics in dmoz " "as other results"; m->m_desc = "Decay factor for pages with common topics in " "dmoz as other results. 
" "The initial demotion factor will be decayed by this factor " "as a percent as the number of common topics increase. " "0 means no decay. " "A safe range is between 0 and 0.25. "; m->m_cgi = "pqrctidd"; m->m_off = (char *)&cr.m_pqr_decFactComTopicInDmoz - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages with common topics in dmoz " "as other results"; m->m_desc = "Max number of common topics in dmoz as other results. " "Pages with a number of common topics equal to or greater " "than this value will be demoted to the maximum as given " "by the initial factor above as a percent. "; m->m_cgi = "pqrctidm"; m->m_off = (char *)&cr.m_pqr_maxValComTopicInDmoz - x; m->m_type = TYPE_LONG; m->m_def = "32"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages where dmoz category names " "contain query terms or their synonyms"; m->m_desc = "Demotion factor for pages where dmoz category names " "contain fewer query terms or their synonyms. " "Pages will be penalized for each query term or synonym of " "a query term less than the max value given below multiplied " "by this factor, divided by the max value. " "Generally, a page will not be demoted more than this value " "as a percent. " "0 means no demotion. " "A safe range is between 0 and 0.3. "; m->m_cgi = "pqrdcndcqt"; m->m_off = (char *)&cr.m_pqr_demFactDmozCatNmNoQT - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages where dmoz category names " "contain query terms or their synonyms"; m->m_desc = "Max number of query terms and their synonyms " "in a page's dmoz category name. " "Pages with a number of query terms or their synonyms in all " "dmoz category names greater than or equal to this value " "will not be demoted. "; m->m_cgi = "pqrcndcqtm"; m->m_off = (char *)&cr.m_pqr_maxValDmozCatNmNoQT - x; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages where dmoz category names " "contain gigabits"; m->m_desc = "Demotion factor for pages where dmoz category " "names contain fewer gigabits. " "Pages will be penalized by the number of gigabits in all " "dmoz category names fewer than the max value given below " "divided by the max value. " "Generally, a page will not be demoted more than than this " "factor as a percent. " "0 means no demotion. " "A safe range is between 0 and 0.3. "; m->m_cgi = "pqrdcndcgb"; m->m_off = (char *)&cr.m_pqr_demFactDmozCatNmNoGigabits - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for pages where dmoz category names " "contain gigabits"; m->m_desc = "Max number of pages where dmoz category names " "contain a gigabit. " "Pages with a number of gigabits in all dmoz category names " "greater than or equal to this value will not be demoted. 
"; m->m_cgi = "pqrdcndcgbm"; m->m_off = (char *)&cr.m_pqr_maxValDmozCatNmNoGigabits - x; m->m_type = TYPE_LONG; m->m_def = "16"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages based on datedb date"; m->m_desc = "Demotion factor for pages based on datedb date. " "Pages will be penalized for being published earlier than the " "max date given below. " "The older the page, the more it will be penalized based on " "the time difference between the page's date and the max date, " "divided by the max date. " "Generally, a page will not be demoted more than this " "value as a percent. " "0 means no demotion. " "A safe range is between 0 and 0.4. "; m->m_cgi = "pqrdate"; m->m_off = (char *)&cr.m_pqr_demFactDatedbDate - x; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "min value for demotion based on datedb date "; m->m_desc = "Pages with a publish date equal to or earlier than " "this date will be demoted to the max (the factor above as " "a percent). " "Use this parm in conjunction with the max value below " "to specify the range of dates where demotion occurs. " "If you set this parm near the estimated earliest publish " "date that occurs somewhat frequently, this method can better " "control the additional demotion per publish day. " "This number is given as seconds since the epoch, January 1st, " "1970 divided by 1000. " "0 means use the epoch. "; m->m_cgi = "pqrdatei"; m->m_off = (char *)&cr.m_pqr_minValDatedbDate - x; m->m_type = TYPE_LONG; m->m_def = "631177"; // Jan 01, 1990 m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for demotion based on datedb date "; m->m_desc = "Pages with a publish date greater than or equal to " "this value divided by 1000 will not be demoted. " "Use this parm in conjunction with the min value above " "to specify the range of dates where demotion occurs. " "This number is given as seconds before the current date " "and time taken from the system clock divided by 1000. " "0 means use the current time of the current day. "; m->m_cgi = "pqrdatem"; m->m_off = (char *)&cr.m_pqr_maxValDatedbDate - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages based on proximity"; m->m_desc = "Demotion factor for proximity of query terms in " "a document. The closer together terms occur in a " "document, the higher it will score." "0 means no demotion. "; m->m_cgi = "pqrprox"; m->m_off = (char *)&cr.m_pqr_demFactProximity - x; //m->m_scgi = "pqrprox"; //m->m_soff = (char *)&si.m_pqr_demFactProximity - y; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for pages based on query terms section"; m->m_desc = "Demotion factor for where the query terms occur " "in the document. If the terms only occur in a menu, " "a link, or a list, the document will be punished." "0 means no demotion. 
"; m->m_cgi = "pqrinsec"; //m->m_scgi = "pqrinsec"; m->m_off = (char *)&cr.m_pqr_demFactInSection - x; //m->m_soff = (char *)&si.m_pqr_demFactInSection - y; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "weight of indexed score on pqr"; m->m_desc = "The proportion that the original score affects " "its rerank position. A factor of 1 will maintain " "the original score, 0 will only use the indexed " "score to break ties."; m->m_cgi = "pqrorig"; //m->m_scgi = "pqrorig"; m->m_off = (char *)&cr.m_pqr_demFactOrigScore - x; //m->m_soff = (char *)&si.m_pqr_demFactOrigScore - y; m->m_type = TYPE_FLOAT; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max value for demotion for pages based on proximity"; m->m_desc = "Max summary score where no more demotion occurs above. " "Pages with a summary score greater than or equal to this " "value will not be demoted. "; m->m_cgi = "pqrproxm"; m->m_off = (char *)&cr.m_pqr_maxValProximity - x; m->m_type = TYPE_LONG; m->m_def = "100000"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion for query being exclusivly in a subphrase"; m->m_desc = "Search result which contains the query terms only" " as a subphrase of a larger phrase will have its score " " reduced by this percent."; m->m_cgi = "pqrspd"; m->m_off = (char *)&cr.m_pqr_demFactSubPhrase - x; //m->m_soff = (char *)&si.m_pqr_demFactSubPhrase - y; //m->m_scgi = "pqrspd"; m->m_type = TYPE_FLOAT; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "demotion based on common inlinks"; m->m_desc = "Based on the number of inlinks a search results has " "which are in common with another search result."; m->m_cgi = "pqrcid"; m->m_off = (char *)&cr.m_pqr_demFactCommonInlinks - x; //m->m_soff = (char *)&si.m_pqr_demFactCommonInlinks - y; //m->m_scgi = "pqrcid"; m->m_type = TYPE_FLOAT; m->m_def = ".5"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of document calls multiplier"; m->m_desc = "Allows more results to be gathered in the case of " "an index having a high rate of duplicate results. 
Generally" " expressed as 1.2"; m->m_cgi = "ndm"; m->m_off = (char *)&cr.m_numDocsMultiplier - x; m->m_type = TYPE_FLOAT; m->m_def = "1.2"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; /* m->m_title = "max documents to compute per host"; m->m_desc = "Limit number of documents to search that do not provide" " the required results."; m->m_cgi = "mdi"; m->m_off = (char *)&cr.m_maxDocIdsToCompute - x; m->m_type = TYPE_LONG; m->m_def = "1000"; m->m_group = 0; m++; */ m->m_title = "max real time inlinks"; m->m_desc = "Limit number of linksdb inlinks requested per result."; m->m_cgi = "mrti"; m->m_off = (char *)&cr.m_maxRealTimeInlinks - x; //m->m_soff = (char *)&si.m_maxRealTimeInlinks - y; m->m_type = TYPE_LONG; m->m_def = "10000"; m->m_group = 0; //m->m_scgi = "mrti"; m->m_smin = 0; m->m_smax = 100000; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "percent topic similar default"; m->m_desc = "Like above, but used for deciding when to cluster " "results by topic for the news collection."; m->m_cgi = "ptcd"; m->m_off = (char *)&cr.m_topicSimilarCutoffDefault - x; m->m_type = TYPE_LONG; m->m_def = "50"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max query terms"; m->m_desc = "Do not allow more than this many query terms. Helps " "prevent big queries from resource hogging."; m->m_cgi = "mqt"; m->m_off = (char *)&cr.m_maxQueryTerms - x; //m->m_soff = (char *)&si.m_maxQueryTerms - y; m->m_type = TYPE_LONG; m->m_def = "999999"; // now we got synonyms... etc m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; /* m->m_title = "dictionary site"; m->m_desc = "Where do we send requests for definitions of search " "terms. Set to the empty string to turn this feature off."; m->m_cgi = "dictionarySite"; m->m_off = (char *)&cr.m_dictionarySite - x; m->m_type = TYPE_STRING; m->m_size = SUMMARYHIGHLIGHTTAGMAXSIZE; m->m_def = "http://www.answers.com/"; m++; */ /* m->m_title = "allow links: searches"; m->m_desc = "Allows anyone access to perform links: searches on this " "collection."; m->m_cgi = "als"; m->m_off = (char *)&cr.m_allowLinksSearch - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ // REFERENCE PAGES CONTROLS m->m_title = "number of reference pages to generate"; m->m_desc = "What is the number of " "reference pages to generate per query? Set to 0 to save " "CPU time."; m->m_cgi = "nrp"; m->m_off = (char *)&cr.m_refs_numToGenerate - x; //m->m_soff = (char *)&si.m_refs_numToGenerate - y; m->m_smaxc = (char *)&cr.m_refs_numToGenerateCeiling - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_priv = 0; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of reference pages to generate"; m->m_desc = "What is the number of " "reference pages to generate per query? 
Set to 0 to save " "CPU time."; m->m_cgi = "snrp"; m->m_off = (char *)&si.m_refs_numToGenerate - y; m->m_type = TYPE_LONG; m->m_defOff =(char *)&cr.m_refs_numToGenerate - x; m->m_priv = 0; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "number of reference pages to display"; m->m_desc = "What is the number of " "reference pages to display per query?"; m->m_cgi = "nrpdd"; m->m_off = (char *)&cr.m_refs_numToDisplay - x; //m->m_soff = (char *)&si.m_refs_numToDisplay - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_priv = 0; // allow the (more) link m->m_sprpg = 0; // do not propagate m->m_sprpp = 0; // do not propagate m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "docs to scan for reference pages"; m->m_desc = "How many search results should we " "scan for reference pages per query?"; m->m_cgi = "dsrp"; m->m_off = (char *)&cr.m_refs_docsToScan - x; //m->m_soff = (char *)&si.m_refs_docsToScan - y; m->m_smaxc = (char *)&cr.m_refs_docsToScanCeiling - x; m->m_type = TYPE_LONG; m->m_def = "30"; m->m_group = 0; m->m_priv = 0; m->m_smin = 0; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; m->m_title = "min references quality"; m->m_desc = "References with page quality below this " "will be excluded. (Set to 101 to disable references while " "still generating related pages.)"; m->m_cgi = "mrpq"; m->m_off = (char *)&cr.m_refs_minQuality - x; //m->m_soff = (char *)&si.m_refs_minQuality - y; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "min links per reference"; m->m_desc = "References need this many links to results to " "be included."; m->m_cgi = "mlpr"; m->m_off = (char *)&cr.m_refs_minLinksPerReference - x; //m->m_soff = (char *)&si.m_refs_minLinksPerReference - y; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max linkers to consider for references per page"; m->m_desc = "Stop processing referencing pages after hitting this " "limit."; m->m_cgi = "mrpl"; m->m_off = (char *)&cr.m_refs_maxLinkers - x; //m->m_soff = (char *)&si.m_refs_maxLinkers - y; m->m_smaxc = (char *)&cr.m_refs_maxLinkersCeiling - x; m->m_type = TYPE_LONG; m->m_def = "500"; m->m_group = 0; m->m_priv = 2; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "page fetch multiplier for references"; m->m_desc = "Use this multiplier to fetch more than the required " "number of reference pages. 
Fetches N * (this parm) " "references and displays the top scoring N."; m->m_cgi = "ptrfr"; m->m_off = (char *)&cr.m_refs_additionalTRFetch - x; //m->m_soff = (char *)&si.m_refs_additionalTRFetch - y; m->m_smaxc = (char *)&cr.m_refs_additionalTRFetchCeiling - x; m->m_type = TYPE_FLOAT; m->m_def = "1.5"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of links coefficient"; m->m_desc = "A in A * numLinks + B * quality + C * " "numLinks/totalLinks."; m->m_cgi = "nlc"; m->m_off = (char *)&cr.m_refs_numLinksCoefficient - x; //m->m_soff = (char *)&si.m_refs_numLinksCoefficient - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "quality coefficient"; m->m_desc = "B in A * numLinks + B * quality + C * " "numLinks/totalLinks."; m->m_cgi = "qc"; m->m_off = (char *)&cr.m_refs_qualityCoefficient - x; //m->m_soff = (char *)&si.m_refs_qualityCoefficient - y; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "link density coefficient"; m->m_desc = "C in A * numLinks + B * quality + C * " "numLinks/totalLinks."; m->m_cgi = "ldc"; m->m_off = (char *)&cr.m_refs_linkDensityCoefficient - x; //m->m_soff = (char *)&si.m_refs_linkDensityCoefficient - y; m->m_type = TYPE_LONG; m->m_def = "1000"; m->m_group = 0; m->m_priv = 2; //m->m_sparm = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "add or multiply quality times link density"; m->m_desc = "[+|*] in A * numLinks + B * quality [+|*]" " C * numLinks/totalLinks."; m->m_cgi = "mrs"; m->m_off = (char *)&cr.m_refs_multiplyRefScore - x; //m->m_soff = (char *)&si.m_refs_multiplyRefScore - y; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; // reference pages ceiling parameters m->m_title = "maximum allowed value for " "numReferences parameter"; m->m_desc = "maximum allowed value for " "numReferences parameter"; m->m_cgi = "nrpc"; m->m_off = (char *)&cr.m_refs_numToGenerateCeiling - x; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "maximum allowed value for " "docsToScanForReferences parameter"; m->m_desc = "maximum allowed value for " "docsToScanForReferences parameter"; m->m_cgi = "dsrpc"; m->m_off = (char *)&cr.m_refs_docsToScanCeiling - x; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "maximum allowed value for " "maxLinkers parameter"; m->m_desc = "maximum allowed value for " "maxLinkers parameter"; m->m_cgi = "mrplc"; m->m_off = (char *)&cr.m_refs_maxLinkersCeiling - x; m->m_type = TYPE_LONG; m->m_def = "5000"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++;
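// One plausible reading (an assumption, not engine code) of the reference
// scoring formula spelled out by the coefficient parms above,
// A * numLinks + B * quality [+|*] C * numLinks/totalLinks, where the
// "mrs" bool chooses between adding and multiplying the last term:
//
//	float refScore ( int32_t numLinks , int32_t quality ,
//			 int32_t totalLinks , CollectionRec *cr ) {
//		float a = cr->m_refs_numLinksCoefficient    * numLinks;
//		float b = cr->m_refs_qualityCoefficient     * quality;
//		float c = cr->m_refs_linkDensityCoefficient *
//			  (float)numLinks / (float)totalLinks;
//		if ( cr->m_refs_multiplyRefScore ) return ( a + b ) * c;
//		return a + b + c;
//	}
m->m_title = "maximum allowed value for " "additionalTRFetch"; m->m_desc = "maximum allowed value for " "additionalTRFetch parameter"; m->m_cgi = "ptrfrc"; m->m_off = (char *)&cr.m_refs_additionalTRFetchCeiling - x; m->m_type = TYPE_FLOAT; m->m_def = "10"; m->m_group = 0; m->m_priv = 2; m->m_flags = 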
PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; // related pages parameters m->m_title = "number of related pages to generate"; m->m_desc = "number of related pages to generate."; m->m_cgi = "nrpg"; m->m_off = (char *)&cr.m_rp_numToGenerate - x; //m->m_soff = (char *)&si.m_rp_numToGenerate - y; m->m_smaxc = (char *)&cr.m_rp_numToGenerateCeiling - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_priv = 0; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of related pages to display"; m->m_desc = "number of related pages to display."; m->m_cgi = "nrpd"; m->m_off = (char *)&cr.m_rp_numToDisplay - x; //m->m_soff = (char *)&si.m_rp_numToDisplay - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_priv = 0; // allow the (more) link m->m_sprpg = 0; // do not propagate m->m_sprpp = 0; // do not propagate m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of links to scan for related pages"; m->m_desc = "number of links per reference page to scan for related " "pages."; m->m_cgi = "nlpd"; m->m_off = (char *)&cr.m_rp_numLinksPerDoc - x; //m->m_soff = (char *)&si.m_rp_numLinksPerDoc - y; m->m_smaxc = (char *)&cr.m_rp_numLinksPerDocCeiling - x; m->m_type = TYPE_LONG; m->m_def = "1024"; m->m_group = 0; m->m_priv = 2; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "min related page quality"; m->m_desc = "related pages with a quality lower than this will be " "ignored."; m->m_cgi = "merpq"; m->m_off = (char *)&cr.m_rp_minQuality - x; //m->m_soff = (char *)&si.m_rp_minQuality - y; m->m_type = TYPE_LONG; m->m_def = "30"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "min related page score"; m->m_desc = "related pages with an adjusted score lower than this " "will be ignored."; m->m_cgi = "merps"; m->m_off = (char *)&cr.m_rp_minScore - x; //m->m_soff = (char *)&si.m_rp_minScore - y; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "min related page links"; m->m_desc = "related pages with fewer than this number of links" " will be ignored."; m->m_cgi = "merpl"; m->m_off = (char *)&cr.m_rp_minLinks - x; //m->m_soff = (char *)&si.m_rp_minLinks - y; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "coefficient for number of links in related pages score " "calculation"; m->m_desc = "A in A * numLinks + B * avgLnkrQlty + C * PgQlty" " + D * numSRPLinks."; m->m_cgi = "nrplc"; m->m_off = (char *)&cr.m_rp_numLinksCoeff - x; //m->m_soff = (char *)&si.m_rp_numLinksCoeff - y; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "coefficient for average linker quality in related pages " "score calculation"; m->m_desc = "B in A * numLinks + B * avgLnkrQlty + C * PgQlty" " + D * numSRPLinks."; m->m_cgi = "arplqc"; m->m_off = (char *)&cr.m_rp_avgLnkrQualCoeff - x; //m->m_soff = (char *)&si.m_rp_avgLnkrQualCoeff - y; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = 
OBJ_COLL; m++; m->m_title = "coefficient for page quality in related pages " "score calculation"; m->m_desc = "C in A * numLinks + B * avgLnkrQlty + C * PgQlty" " + D * numSRPLinks"; m->m_cgi = "qrpc"; m->m_off = (char *)&cr.m_rp_qualCoeff - x; //m->m_soff = (char *)&si.m_rp_qualCoeff - y; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "coefficient for search result links in related pages " "score calculation"; m->m_desc = "D in A * numLinks + B * avgLnkrQlty + C * PgQlty" " + D * numSRPLinks."; m->m_cgi = "srprpc"; m->m_off = (char *)&cr.m_rp_srpLinkCoeff - x; //m->m_soff = (char *)&si.m_rp_srpLinkCoeff - y; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of related page summary excerpts"; m->m_desc = "What is the maximum number of " "excerpts displayed in the summary of a related page?"; m->m_cgi = "nrps"; m->m_off = (char *)&cr.m_rp_numSummaryLines - x; //m->m_soff = (char *)&si.m_rp_numSummaryLines - y; m->m_smaxc = (char *)&cr.m_rp_numSummaryLinesCeiling - x; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_priv = 2; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "highlight query terms in related pages summary"; m->m_desc = "Highlight query terms in related pages summary."; m->m_cgi = "hqtirps"; m->m_off = (char *)&cr.m_rp_doRelatedPageSumHighlight - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of characters to display in title before " "truncating"; m->m_desc = "Truncates a related page title after this many " "characters and adds ..."; m->m_cgi = "ttl"; m->m_off = (char *)&cr.m_rp_titleTruncateLimit - x; //m->m_soff = (char *)&si.m_rp_titleTruncateLimit - y; m->m_type = TYPE_LONG; m->m_def = "50"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "use results pages as references"; m->m_desc = "Use the search results' links in order to generate " "related pages."; m->m_cgi = "urar"; m->m_off = (char *)&cr.m_rp_useResultsAsReferences - x; //m->m_soff = (char *)&si.m_rp_useResultsAsReferences - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "get related pages from other cluster"; m->m_desc = "Say yes here to make Gigablast check another Gigablast " "cluster for title rec for related pages. 
Gigablast will " "use the hosts2.conf file in the working directory to " "tell it what hosts belong to the other cluster."; m->m_cgi = "erp"; // external related pages m->m_off = (char *)&cr.m_rp_getExternalPages - x; //m->m_soff = (char *)&si.m_rp_getExternalPages - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "collection for other related pages cluster"; m->m_desc = "Gigablast will fetch the related pages title record " "from this collection in the other cluster."; m->m_cgi = "erpc"; // external related pages collection m->m_off = (char *)&cr.m_rp_externalColl - x; //m->m_soff = (char *)&si.m_rp_externalColl - y; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN; m->m_def = "main"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; // related pages ceiling parameters m->m_title = "maximum allowed value for numToGenerate parameter"; m->m_desc = "maximum allowed value for numToGenerate parameter"; m->m_cgi = "nrpgc"; m->m_off = (char *)&cr.m_rp_numToGenerateCeiling - x; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "maximum allowed value for numRPLinksPerDoc parameter"; m->m_desc = "maximum allowed value for numRPLinksPerDoc parameter"; m->m_cgi = "nlpdc"; m->m_off = (char *)&cr.m_rp_numLinksPerDocCeiling - x; m->m_type = TYPE_LONG; m->m_def = "5000"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "maximum allowed value for numSummaryLines parameter"; m->m_desc = "maximum allowed value for numSummaryLines parameter"; m->m_cgi = "nrpsc"; m->m_off = (char *)&cr.m_rp_numSummaryLinesCeiling - x; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; // import search results controls m->m_title = "how many imported results should we insert"; m->m_desc = "Gigablast will import X search results from the " "external cluster given by hosts2.conf and merge those " "search results into the current set of search results. " "Set to 0 to disable."; m->m_cgi = "imp"; m->m_off = (char *)&cr.m_numResultsToImport - x; //m->m_soff = (char *)&si.m_numResultsToImport - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++;
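// A minimal sketch (assumed combination, not lifted from the merge code)
// of how the import weights defined below are expected to interact: the
// imported result's score is scaled down by impw, then boosted per linker
// from the base collection via impnlw.
//
//	float importedFinalScore ( float score , int32_t numLinkers ,
//				   CollectionRec *cr ) {
//		if ( numLinkers > 100 ) numLinkers = 100; // stated max
//		float s = score * cr->m_importWeight;      // e.g. .80
//		s += numLinkers * cr->m_numLinkerWeight;   // e.g. 50
//		return s;
//	}
m->m_title = "imported score weight"; m->m_desc = "The score of all imported results will be multiplied " "by this number. 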
Since results are mostly imported from " "a large collection they will usually have higher scores " "because of having more link texts or whatever, so tone it " "down a bit to put it on par with the integrating collection."; m->m_cgi = "impw"; m->m_off = (char *)&cr.m_importWeight - x; //m->m_soff = (char *)&si.m_importWeight - y; m->m_type = TYPE_FLOAT; m->m_def = ".80"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "how many linkers must each imported result have"; m->m_desc = "The urls of imported search results must be linked to " "by at least this many documents in the primary collection."; m->m_cgi = "impl"; m->m_off = (char *)&cr.m_minLinkersPerImportedResult - x; //m->m_soff = (char *)&si.m_minLinkersPerImportedResult - y; m->m_type = TYPE_LONG; m->m_def = "3"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "num linkers weight"; m->m_desc = "The number of linkers an imported result has from " "the base collection is multiplied by this weight and then " "added to the final score. The higher this is the more an " "imported result with a lot of linkers will be boosted. " "Currently, 100 is the max number of linkers permitted."; m->m_cgi = "impnlw"; m->m_off = (char *)&cr.m_numLinkerWeight - x; //m->m_soff = (char *)&si.m_numLinkerWeight - y; m->m_type = TYPE_LONG; m->m_def = "50"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "the name of the collection to import from"; m->m_desc = "Gigablast will import X search results from this " "external collection and merge them into the current search " "results."; m->m_cgi = "impc"; m->m_off = (char *)&cr.m_importColl - x; //m->m_soff = (char *)&si.m_importColl - y; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN; m->m_def = "main"; m->m_group = 0; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max similar results for cluster by topic"; m->m_desc = "Max similar results to show when clustering by topic."; m->m_cgi = "ncbt"; m->m_off = (char *)&cr.m_maxClusterByTopicResults - x; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_group = 0; //m->m_scgi = "ncbt"; //m->m_soff = (char *)&si.m_maxClusterByTopicResults - y; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of extra results to get for cluster by topic"; m->m_desc = "number of extra results to get for cluster by topic"; m->m_cgi = "ntwo"; m->m_off = (char *)&cr.m_numExtraClusterByTopicResults - x; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; //m->m_scgi = "ntwo"; //m->m_soff = (char *)&si.m_numExtraClusterByTopicResults - y; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "Minimum number of in linkers required to consider getting" " the title from in linkers"; m->m_desc = "Minimum number of in linkers required to consider getting" " the title from in linkers"; m->m_cgi = "mininlinkers"; m->m_off = (char *)&cr.m_minTitleInLinkers - x; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "Max number of in linkers to consider"; m->m_desc = "Max number of in linkers to consider for getting in " "linkers titles."; m->m_cgi = "maxinlinkers"; m->m_off = (char 
*)&cr.m_maxTitleInLinkers - x; m->m_type = TYPE_LONG; m->m_def = "128"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max title len"; m->m_desc = "What is the maximum number of " "characters allowed in titles displayed in the search " "results?"; m->m_cgi = "tml"; m->m_defOff= (char *)&cr.m_titleMaxLen - x; m->m_off = (char *)&si.m_titleMaxLen - y; m->m_type = TYPE_LONG; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; /* m->m_title = "use new summary generator"; m->m_desc = "Also used for gigabits and titles."; m->m_cgi = "uns"; // external related pages m->m_off = (char *)&cr.m_useNewSummaries - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_sparm = 1; m->m_scgi = "uns"; m->m_soff = (char *)&si.m_useNewSummaries - y; m++; */ m->m_title = "summary mode"; m->m_desc = "0 = old compatibility mode, 1 = UTF-8 mode, " "2 = fast ASCII mode, " "3 = Ascii Proximity Summary, " "4 = Utf8 Proximity Summary, " "5 = Ascii Pre Proximity Summary, " "6 = Utf8 Pre Proximity Summary:"; m->m_cgi = "smd"; m->m_off = (char *)&cr.m_summaryMode - x; m->m_type = TYPE_LONG; m->m_def = "0"; //m->m_scgi = "smd"; //m->m_soff = (char*) &si.m_summaryMode - y; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of summary excerpts"; m->m_desc = "How many summary excerpts to display per search result?"; m->m_cgi = "ns"; m->m_type = TYPE_LONG; m->m_defOff= (char *)&cr.m_summaryMaxNumLines - x; m->m_group = 0; m->m_off = (char *)&si.m_numLinesInSummary - y; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "max summary line width"; m->m_desc = "<br> tags are inserted to keep the number " "of chars in the summary per line at or below this width. " "Also affects title. " "Strings without spaces that exceed this " "width are not split. Has no effect on xml or json feeds; " "only works on html."; m->m_cgi = "sw"; //m->m_off = (char *)&cr.m_summaryMaxWidth - x; m->m_off = (char *)&si.m_summaryMaxWidth - y; m->m_defOff= (char *)&cr.m_summaryMaxWidth - x; m->m_type = TYPE_LONG; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "max summary excerpt length"; m->m_desc = "What is the maximum number of " "characters allowed per summary excerpt?"; m->m_cgi = "smxcpl"; m->m_off = (char *)&si.m_summaryMaxNumCharsPerLine - y; m->m_defOff= (char *)&cr.m_summaryMaxNumCharsPerLine - x; m->m_type = TYPE_LONG; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; /* m->m_title = "enable page turk"; m->m_desc = "If enabled, search results shall feed the page turk, " "which is used to mechanically rank websites."; m->m_cgi = "ept"; m->m_def = "0"; m->m_off = (char *)&cr.m_pageTurkEnabled - x; m->m_type = TYPE_BOOL; m++; */ m->m_title = "results to scan for gigabits generation"; m->m_desc = "How many search results should we " "scan for gigabit (related topics) generation. 
Set this to " "zero to disable gigabits!"; m->m_cgi = "dsrt"; m->m_off = (char *)&si.m_docsToScanForTopics - y; m->m_type = TYPE_LONG; m->m_defOff= (char *)&cr.m_docsToScanForTopics - x; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "ip restriction for gigabits"; m->m_desc = "Should Gigablast only get one document per IP domain " "and per domain for gigabits (related topics) generation?"; m->m_cgi = "ipr"; m->m_off = (char *)&si.m_ipRestrictForTopics - y; m->m_defOff= (char *)&cr.m_ipRestrict - x; m->m_type = TYPE_BOOL; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "number of gigabits to show"; m->m_desc = "What is the number of gigabits (related topics) " "displayed per query? Set to 0 to save a little CPU time."; m->m_cgi = "nrt"; m->m_defOff= (char *)&cr.m_numTopics - x; m->m_off = (char *)&si.m_numTopicsToDisplay - y; m->m_type = TYPE_LONG; m->m_def = "11"; m->m_group = 0; m->m_sprpg = 0; // do not propagate m->m_sprpp = 0; // do not propagate m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "min topics score"; m->m_desc = "Gigabits (related topics) with scores below this " "will be excluded. Scores range from 0% to over 100%."; m->m_cgi = "mts"; m->m_defOff= (char *)&cr.m_minTopicScore - x; m->m_off = (char *)&si.m_minTopicScore - y; m->m_type = TYPE_LONG; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "min gigabit doc count by default"; m->m_desc = "How many documents must contain the gigabit " "(related topic) in order for it to be displayed."; m->m_cgi = "mdc"; m->m_defOff= (char *)&cr.m_minDocCount - x; m->m_off = (char *)&si.m_minDocCount - y; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "dedup doc percent for gigabits (related topics)"; m->m_desc = "If a document is this percent similar to another " "document with a higher score, then it will not contribute " "to the gigabit generation."; m->m_cgi = "dsp"; m->m_defOff= (char *)&cr.m_dedupSamplePercent - x; m->m_off = (char *)&si.m_dedupSamplePercent - y; m->m_type = TYPE_LONG; m->m_def = "80"; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; /////////////////////////////////////////// // SPIDER PROXY CONTROLS // /////////////////////////////////////////// m->m_title = "always use spider proxies for all collections"; m->m_desc = "ALWAYS Use the spider proxies listed below for " "spidering. If none are " "listed then gb will not use any. Applies to all collections. " "If you want to regulate this on a per collection basis then " "set this to NO here and adjust the " "proxy controls on the " "spider controls page. If the list of proxy IPs below " "is empty, then of course, no proxies will be used."; m->m_cgi = "useproxyips"; m->m_xml = "useSpiderProxies"; m->m_off = (char *)&g_conf.m_useProxyIps - g; m->m_type = TYPE_BOOL; m->m_def = "0"; // hide this for now. just make it a per collection parm. m->m_flags = PF_HIDDEN; m->m_page = PAGE_SPIDERPROXIES; m->m_obj = OBJ_CONF; m++; m->m_title = "automatically use spider proxies for all collections"; m->m_desc = "AUTOMATICALLY use the spider proxies listed below for " "spidering. If none are " "listed then gb will not use any. Applies to all collections. 
" "If you want to regulate this on a per collection basis then " "set this to NO here and adjust the " "proxy controls on the " "spider controls page. If the list of proxy IPs below " "is empty, then of course, no proxies will be used."; m->m_cgi = "autouseproxyips"; m->m_xml = "automaticallyUseSpiderProxies"; m->m_off = (char *)&g_conf.m_automaticallyUseProxyIps - g; m->m_type = TYPE_BOOL; m->m_def = "0"; // hide this for now. just make it a per collection parm. m->m_flags = PF_HIDDEN; m->m_page = PAGE_SPIDERPROXIES; m->m_obj = OBJ_CONF; m++; m->m_title = "spider proxy ips"; m->m_desc = "List of white space-separated spider proxy IPs. Put " "in IP:port format. Example 1.2.3.4:80 4.5.6.7:99. " "You can also use username:password@1.2.3.4:80. " "If a proxy itself times out when downloading through it " "it will be perceived as a normal download timeout and the " "page will be retried according to the url filters table, so " "you might want to modify the url filters to retry network " "errors more aggressively. Search for 'private proxies' on " "google to find proxy providers. Try to ensure all your " "proxies are on different class C IPs if possible. " "That is, the first 3 numbers in the IP addresses are all " "different."; m->m_cgi = "proxyips"; m->m_xml = "proxyIps"; m->m_off = (char *)&g_conf.m_proxyIps - g; m->m_type = TYPE_SAFEBUF; // TYPE_IP; m->m_def = ""; m->m_flags = PF_TEXTAREA | PF_REBUILDPROXYTABLE; m->m_page = PAGE_SPIDERPROXIES; m->m_obj = OBJ_CONF; m++; m->m_title = "spider proxy test url"; m->m_desc = "Download this url every minute through each proxy " "listed above to ensure they are up. Typically you should " "make this a URL you own so you do not aggravate another " "webmaster."; m->m_xml = "proxyTestUrl"; m->m_cgi = "proxytesturl"; m->m_off = (char *)&g_conf.m_proxyTestUrl - g; m->m_type = TYPE_SAFEBUF; m->m_def = "http://www.gigablast.com/"; m->m_flags = 0; m->m_page = PAGE_SPIDERPROXIES; m->m_obj = OBJ_CONF; m++; m->m_title = "reset proxy table"; m->m_desc = "Reset the proxy statistics in the table below. Makes " "all your proxies treated like new again."; m->m_cgi = "resetproxytable"; m->m_type = TYPE_CMD; m->m_func = CommandResetProxyTable; m->m_cast = 1; m->m_page = PAGE_SPIDERPROXIES; m->m_obj = OBJ_CONF; m++; m->m_title = "mix up user agents"; m->m_desc = "Use random user-agents when downloading through " "a spider proxy listed above to " "protecting gb's anonymity. The User-Agent used is a function " "of the proxy IP/port and IP of the url being downloaded. " "That way it is consistent when downloading the same website " "through the same proxy."; m->m_cgi = "userandagents"; m->m_xml = "useRandAgents"; m->m_off = (char *)&g_conf.m_useRandAgents - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = 0; m->m_page = PAGE_SPIDERPROXIES; m->m_obj = OBJ_CONF; m++; m->m_title = "squid proxy authorized users"; m->m_desc = "Gigablast can also simulate a squid proxy, " "complete with " "caching. It will forward your request to the proxies you " "list above, if any. This list consists of space-separated " "username:password items. Leave this list empty " "to disable squid caching behaviour. The default cache " "size for this is 10MB per shard. 
Use item *:* to allow " "anyone access."; m->m_xml = "proxyAuth"; m->m_cgi = "proxyAuth"; m->m_off = (char *)&g_conf.m_proxyAuth - g; m->m_type = TYPE_SAFEBUF; m->m_def = ""; m->m_flags = PF_TEXTAREA; m->m_page = PAGE_SPIDERPROXIES; m->m_obj = OBJ_CONF; m++; m->m_title = "max words per gigabit (related topic) by default"; m->m_desc = "Maximum number of words a gigabit (related topic) " "can have. Affects xml feeds, too."; m->m_cgi = "mwpt"; m->m_defOff= (char *)&cr.m_maxWordsPerTopic - x; m->m_off = (char *)&si.m_maxWordsPerTopic - y; m->m_type = TYPE_LONG; m->m_def = "6"; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "show images"; m->m_desc = "Should we return or show the thumbnail images in the " "search results?"; m->m_cgi = "showimages"; m->m_off = (char *)&si.m_showImages - y; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_sprpg = 0; m->m_sprpp = 0; m->m_flags = PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "use cache"; m->m_desc = "Use 0 if Gigablast should not read or write from " "any caches at any level."; m->m_def = "-1"; m->m_off = (char *)&si.m_useCache - y; m->m_type = TYPE_CHAR; m->m_cgi = "usecache"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "read from cache"; m->m_desc = "Should we read search results from the cache? Set " "to false to fix dmoz bug."; m->m_cgi = "rcache"; m->m_off = (char *)&si.m_rcache - y; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_sprpg = 0; m->m_sprpp = 0; m->m_flags = PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "write to cache"; m->m_desc = "Use 0 if Gigablast should not write to " "any caches at any level."; m->m_def = "-1"; m->m_off = (char *)&si.m_wcache - y; m->m_type = TYPE_CHAR; m->m_cgi = "wcache"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "max serp docid"; m->m_desc = "Start displaying results after this score/docid pair. " "Used by widget to append results to end when index is " "volatile."; m->m_def = "0"; m->m_off = (char *)&si.m_minSerpDocId - y; m->m_type = TYPE_LONG_LONG; m->m_cgi = "minserpdocid"; m->m_flags = PF_API; m->m_smin = 0; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "max serp score"; m->m_desc = "Start displaying results after this score/docid pair. " "Used by widget to append results to end when index is " "volatile."; m->m_def = "0"; m->m_off = (char *)&si.m_maxSerpScore - y; m->m_type = TYPE_DOUBLE; m->m_cgi = "maxserpscore"; m->m_flags = PF_API; m->m_smin = 0; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "restrict search to this url"; m->m_desc = "Does a url: query."; m->m_off = (char *)&si.m_url - y; m->m_type = TYPE_CHARPTR;//STRING; //m->m_size = MAX_URL_LEN; m->m_cgi = "url"; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; m->m_title = "restrict search to pages that link to this url"; m->m_desc = "The url which the pages must link to."; m->m_off = (char *)&si.m_link - y; m->m_type = TYPE_CHARPTR;//STRING; //m->m_size = MAX_URL_LEN; m->m_cgi = "link"; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "search for this phrase quoted"; m->m_desc = "The phrase which will be quoted in the query. 
From the " "advanced search page, adv.html."; m->m_off = (char *)&si.m_quote1 - y; m->m_type = TYPE_CHARPTR;//STRING; //m->m_size = 512; m->m_cgi = "quotea"; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; m->m_title = "search for this second phrase quoted"; m->m_desc = "The phrase which will be quoted in the query. From the " "advanced search page, adv.html."; m->m_off = (char *)&si.m_quote2 - y; m->m_type = TYPE_CHARPTR;//STRING; //m->m_size = 512; m->m_cgi = "quoteb"; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; /* m->m_title = "restrict results to this site"; m->m_desc = "Returned results will have URLs from this site, X."; m->m_off = (char *)&si.m_site - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_cgi = "site"; m->m_size = 1024; // MAX_SITE_LEN; m->m_sprpg = 1; m->m_sprpp = 1; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; */ m->m_title = "restrict results to these sites"; m->m_desc = "Returned results will have URLs from this " "space-separated list of sites. Can have up to 200 sites. " "A site can include sub folders. This allows you to build " "a Custom Topic Search Engine."; m->m_off = (char *)&si.m_sites - y; m->m_type = TYPE_CHARPTR; //m->m_size = 32*1024; // MAX_SITES_LEN; m->m_cgi = "sites"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_sprpg = 1; m->m_sprpp = 1; m++; m->m_title = "require these query terms"; m->m_desc = "Returned results will have all the words in X. " "From the advanced search page, adv.html."; m->m_off = (char *)&si.m_plus - y; m->m_def = NULL; m->m_type = TYPE_CHARPTR;//STRING; m->m_cgi = "plus"; //m->m_size = 500; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; m->m_title = "avoid these query terms"; m->m_desc = "Returned results will NOT have any of the words in X. 
" "From the advanced search page, adv.html."; m->m_off = (char *)&si.m_minus - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_cgi = "minus"; //m->m_size = 500; m->m_sprpg = 0; m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; m->m_title = "format of the returned search results"; m->m_desc = "Can be html, xml or json to get results back in that " "format."; m->m_def = "html"; m->m_off = (char *)&si.m_formatStr - y; m->m_type = TYPE_CHARPTR; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_cgi = "format"; m->m_flags = PF_NOAPI; // already in the api, so don't repeat m++; m->m_title = "family filter"; m->m_desc = "Remove objectionable results if this is enabled."; m->m_def = "0"; m->m_off = (char *)&si.m_familyFilter - y; m->m_type = TYPE_BOOL; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_cgi = "ff"; m++; m->m_title = "highlight query terms in summaries"; m->m_desc = "Use to disable or enable " "highlighting of the query terms in the summaries."; m->m_def = "1"; m->m_off = (char *)&si.m_doQueryHighlighting - y; m->m_type = TYPE_BOOL; m->m_cgi = "qh"; m->m_smin = 0; m->m_smax = 8; m->m_sprpg = 1; // turn off for now m->m_sprpp = 1; m->m_flags = PF_API; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "cached page highlight query"; m->m_desc = "Highlight the terms in this query instead."; m->m_def = NULL; m->m_off = (char *)&si.m_highlightQuery - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_cgi = "hq"; //m->m_size = 1000; m->m_sprpg = 0; // no need to propagate this one m->m_sprpp = 0; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; /* m->m_title = "highlight event date in summaries."; m->m_desc = "Can be 0 or 1 to respectively disable or enable " "highlighting of the event date terms in the summaries."; m->m_def = "0"; m->m_off = (char *)&si.m_doDateHighlighting - y; m->m_type = TYPE_BOOL; m->m_cgi = "dh"; m->m_smin = 0; m->m_smax = 8; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; */ /* m->m_title = "limit search results to this ruleset"; m->m_desc = "limit search results to this ruleset"; m->m_def = "0"; m->m_off = (char *)&si.m_ruleset - y; m->m_type = TYPE_LONG; m->m_cgi = "ruleset"; m->m_smin = 0; m++; */ m->m_title = "Query match offsets"; m->m_desc = "Return a list of the offsets of each query word " "actually matched in the document. 1 means byte offset, " "and 2 means word offset."; m->m_def = "0"; m->m_off = (char *)&si.m_queryMatchOffsets - y; m->m_type = TYPE_LONG; m->m_cgi = "qmo"; m->m_smin = 0; m->m_smax = 2; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; m->m_title = "boolean status"; m->m_desc = "Can be 0 or 1 or 2. 0 means the query is NOT boolean, " "1 means the query is boolean and 2 means to auto-detect."; m->m_def = "2"; m->m_off = (char *)&si.m_boolFlag - y; m->m_type = TYPE_LONG; m->m_cgi = "bq"; m->m_smin = 0; m->m_smax = 2; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "meta tags to display"; m->m_desc = "A space-separated string of meta tag names. " "Do not forget to url-encode the spaces to +'s or %%20's. " "Gigablast will extract the contents of these specified meta " "tags out of the pages listed in the search results and " "display that content after each summary. i.e. " "&dt=description will display the meta description of " "each search result. &dt=description:32+keywords:64 " "will display the meta description and meta keywords of each " "search result and limit the fields to 32 and 64 characters " "respectively. 
When used in an XML feed the <display " "name=\"meta_tag_name\">meta_tag_content</> XML " "tag will be used to convey each requested meta tag's " "content."; m->m_off = (char *)&si.m_displayMetas - y; m->m_type = TYPE_CHARPTR; m->m_cgi = "dt"; //m->m_size = 3000; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; /* // . you can have multiple topics= parms in you query url... // . this is used to set the TopicGroups array in SearchInput m->m_title = "related topic parameters"; m->m_desc = "X=NUM+MAX+SCAN+MIN+MAXW+META+DEL+IDF+DEDUP\n" "
<br><br>\n" "NUM is how many related topics you want " "returned.\n" "
<br><br>\n" "MAX is the maximum number of topics to generate " "and store in cache, so if TW is increased, but still below " "MT, it will result in a fast cache hit.\n" "
<br><br>\n" "SCAN is how many documents to scan for related " "topics. If this is 30, for example, then Gigablast will " "scan the first 30 search results for related topics.\n" "
<br><br>\n" "MIN is the minimum score of returned topics. Ranges " "from 0%% to over 100%%. 50%% is considered pretty good. " "BUG: This must be at least 1 to get any topics back.\n" "
<br><br>\n" "MAXW is the maximum number of words per topic.\n" "
<br><br>\n" "META is the meta tag name to which Gigablast will " "restrict the content used to generate the topics. Do not " "specify this field to restrict the content to the body of " "each document, that is the default.\n" "
<br><br>\n" "DEL is a single character delimiter which defines " "the topic candidates. All candidates must be separated from " "the other candidates with the delimiter. So <meta " "name=test content=\" cat dog ; pig rabbit horse\"> " "when using the ; as a delimiter would only have two topic " "candidates: \"cat dog\" and \"pig rabbit horse\". If no " "delimiter is provided, default functionality is assumed.\n" "
<br><br>\n" "IDF is 1, the default, if you want Gigablast to " "weight topic candidates by their idf, 0 otherwise." "
<br><br>\n" "DEDUP is 1, the default, if the topics should be " "deduped. This involves removing topics that are substrings " "or superstrings of other higher-scoring topics." "
<br><br>\n" "Example: topics=49+100+30+1+6+author+%%3B+0+0" "
<br><br>\n" "The default values for those parameters with unspecified " "defaults can be defined on the \"Search Controls\" page. " "
<br><br>\n" "XML feeds will contain the generated topics like: " "<topic><name><![CDATA[some topic]]><" "/name><score>13</score><from>" "metaTagName</from></topic>" "
<br><br>\n" "Even though somewhat nonstandard, you can specify multiple " "&topic= parameters to get back multiple topic " "groups." "
<br><br>
\n" "Performance will decrease if you increase the MAX, SCAN or " "MAXW."; m->m_type = TYPE_STRING; m->m_size = 512; m->m_cgi = "topics"; m->m_size = 100; // MDW: NO NO NO... was causing a write breach!!! -- take this all out m->m_off = -2; // bogus offset //m->m_off = (char *)&si.m_topics - y; m++; */ m->m_title = "niceness"; m->m_desc = "Can be 0 or 1. 0 is usually a faster, high-priority " "query, 1 is a slower, lower-priority query."; m->m_def = "0"; m->m_off = (char *)&si.m_niceness - y; m->m_type = TYPE_LONG; m->m_cgi = "niceness"; m->m_smin = 0; m->m_smax = 1; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "debug flag"; m->m_desc = "Is 1 to log debug information, 0 otherwise."; m->m_def = "0"; m->m_off = (char *)&si.m_debug - y; m->m_type = TYPE_BOOL; m->m_cgi = "debug"; //m->m_priv = 1; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "return number of docs per topic"; m->m_desc = "Use 1 if you want Gigablast to return the number of " "documents in the search results that contained each topic " "(gigabit)."; m->m_def = "1"; m->m_off = (char *)&si.m_returnDocIdCount - y; m->m_type = TYPE_BOOL; m->m_cgi = "rdc"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "return docids per topic"; m->m_desc = "Use 1 if you want Gigablast to return the list of " "docIds from the search results that contained each topic " "(gigabit)."; m->m_def = "0"; m->m_off = (char *)&si.m_returnDocIds - y; m->m_type = TYPE_BOOL; m->m_cgi = "rd"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "return popularity per topic"; m->m_desc = "Use 1 if you want Gigablast to return the popularity " "of each topic (gigabit)."; m->m_def = "0"; m->m_off = (char *)&si.m_returnPops - y; m->m_type = TYPE_BOOL; m->m_cgi = "rp"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; //m->m_title = "compound list max size"; //m->m_desc = "Is the max size in bytes of the compound termlist. 
" // "Each document id is 6 bytes."; //m->m_def = "-1"; //m->m_off = (char *)&si.m_compoundListMaxSize - y; //m->m_type = TYPE_LONG; //m->m_cgi = "clms"; //m->m_smin = 0; //m->m_priv = 1; //m++; m->m_title = "debug gigabits flag"; m->m_desc = "Is 1 to log gigabits debug information, 0 otherwise."; m->m_def = "0"; m->m_off = (char *)&si.m_debugGigabits - y; m->m_type = TYPE_BOOL; m->m_cgi = "debuggigabits"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "return docids only"; m->m_desc = "Is 1 to return only docids as query results."; m->m_def = "0"; m->m_off = (char *)&si.m_docIdsOnly - y; m->m_type = TYPE_BOOL; m->m_cgi = "dio"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "image url"; m->m_desc = "The url of an image to co-brand on the search " "results page."; m->m_off = (char *)&si.m_imgUrl - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_def = NULL; //m->m_size = 512; m->m_cgi = "iu"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; m->m_title = "image link"; m->m_desc = "The hyperlink to use on the image to co-brand on " "the search results page."; m->m_off = (char *)&si.m_imgLink - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_def = NULL; //m->m_size = 512; m->m_cgi = "ix"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; m->m_title = "image width"; m->m_desc = "The width of the image on the search results page."; m->m_off = (char *)&si.m_imgWidth - y; m->m_type = TYPE_LONG; m->m_cgi = "iw"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_def = "200"; m->m_flags = PF_NOAPI; m++; m->m_title = "image height"; m->m_desc = "The height of the image on the search results " "page."; m->m_off = (char *)&si.m_imgHeight - y; m->m_type = TYPE_LONG; m->m_cgi = "ih"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_def = "200"; m->m_flags = PF_NOAPI; m++; // m->m_title = "password"; // m->m_desc = "The password."; // m->m_off = (char *)&si.m_pwd - y; // m->m_type = TYPE_CHARPTR;//STRING; // m->m_cgi = "pwd"; // m->m_size = 32; // m->m_flags = PF_HIDDEN | PF_NOSAVE; // m->m_page = PAGE_RESULTS; // m->m_obj = OBJ_SI; // m++; m->m_title = "admin override"; m->m_desc = "admin override"; m->m_off = (char *)&si.m_isMasterAdmin - y; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_cgi = "admin"; m->m_sprpg = 1; // propagate on GET request m->m_sprpp = 1; // propagate on POST request m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; /* m->m_title = "language"; m->m_desc = "Language code to restrict search. 0 = All. Uses " "Clusterdb to filter languages. This is being phased out " "please do not use much, use gblang instead."; m->m_off = (char *)&si.m_languageCode - y; m->m_type = TYPE_STRING; m->m_size = 5+1; m->m_def = "none"; // our google gadget gets &lang=en passed to it from google, so // change this!! m->m_cgi = "clang"; m++; */ /* this should be a hash on the lang abbr line gblang:en m->m_title = "GB language"; m->m_desc = "Language code to restrict search. 0 = All. 
Uses " "the gblang: keyword to filter languages."; m->m_off = (char *)&si.m_gblang - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_cgi = "gblang"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; */ // prepend to query m->m_title = "prepend"; m->m_desc = "prepend this to the supplied query followed by a |."; m->m_off = (char *)&si.m_prepend - y; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_cgi = "prepend"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "GB Country"; m->m_desc = "Country code to restrict search"; m->m_off = (char *)&si.m_gbcountry - y; m->m_type = TYPE_CHARPTR;//STRING; //m->m_size = 4+1; m->m_def = NULL; //m->m_def = "iso-8859-1"; m->m_cgi = "gbcountry"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; /* m->m_title = "rerank ruleset"; m->m_desc = "Use this ruleset to rerank the search results. Will " "rerank at least the first X results specified with &n=X. " "And be sure to say &recycle=0 to recompute the quality " "of each page in the search results."; m->m_off = (char *)&si.m_rerankRuleset - y; m->m_type = TYPE_LONG; m->m_def = "-1"; m->m_cgi = "rerank"; m++; m->m_title = "apply ruleset to roots"; m->m_desc = "Recompute the quality of the root urls of each " "search result in order to compute the quality of that " "search result, since it depends on its root quality. This " "can take a lot longer when enabled."; m->m_off = (char *)&si.m_artr - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_cgi = "artr"; m++; */ m->m_title = "show banned pages"; m->m_desc = "show banned pages"; m->m_off = (char *)&si.m_showBanned - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_cgi = "sb"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "allow punctuation in query phrases"; m->m_desc = "allow punctuation in query phrases"; m->m_off = (char *)&si.m_allowPunctInPhrase - y; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_cgi = "apip"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; /* m->m_title = "use ad feed num"; m->m_desc = "use ad feed num"; m->m_off = (char *)&si.m_useAdFeedNum - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_cgi = "uafn"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; */ /* m->m_title = "do bot detection"; m->m_desc = "Passed in for raw feeds that want bot detection cgi " "parameters passed back in the XML."; m->m_off = (char *)&si.m_doBotDetection - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_cgi = "bd"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; */ /* m->m_title = "bot detection query"; m->m_desc = "Passed in for raw feeds that want bot detection cgi " "parameters passed back in the XML. Use this variable " "when an actual query against gigablast is not needed " "(i.e. 
- image/video/news searches)."; m->m_off = (char *)&si.m_botDetectionQuery - y; m->m_type = TYPE_CHARPTR;//STRING; m->m_cgi = "bdq"; m->m_def = NULL; m->m_size = MAX_QUERY_LEN; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; */ m->m_title = "queryCharset"; m->m_desc = "Charset in which the query is encoded"; m->m_off = (char *)&si.m_queryCharset - y; m->m_type = TYPE_CHARPTR;//STRING; //m->m_size = 32+1; m->m_def = "utf-8"; //m->m_def = "iso-8859-1"; m->m_cgi = "qcs"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; // buzz m->m_title = "display inlinks"; m->m_desc = "Display all inlinks of each result."; m->m_off = (char *)&si.m_displayInlinks - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_cgi = "inlinks"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; // buzz m->m_title = "display outlinks"; m->m_desc = "Display all outlinks of each result. outlinks=1 " "displays only external outlinks. outlinks=2 displays " "external and internal outlinks."; m->m_off = (char *)&si.m_displayOutlinks - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_cgi = "outlinks"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_NOAPI; m++; // buzz m->m_title = "display term frequencies"; m->m_desc = "Display Terms and Frequencies in results."; m->m_off = (char *)&si.m_displayTermFreqs - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_cgi = "tf"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; // buzz m->m_title = "spider results"; m->m_desc = "Results of this query will be forced into the spider " "queue for reindexing."; m->m_off = (char *)&si.m_spiderResults - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_cgi = "spiderresults"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; // buzz m->m_title = "spider result roots"; m->m_desc = "Root urls of the results of this query will be forced " "into the spider queue for reindexing."; m->m_off = (char *)&si.m_spiderResultRoots - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_cgi = "spiderresultroots"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; // buzz m->m_title = "just mark clusterlevels"; m->m_desc = "Check for deduping, but just mark the cluster levels " "and the doc deduped against, don't remove the result."; m->m_off = (char *)&si.m_justMarkClusterLevels - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_cgi = "jmcl"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m++; m->m_title = "include cached copy of page"; m->m_desc = "Will cause a cached copy of content to be returned " "instead of summary."; m->m_off = (char *)&si.m_includeCachedCopy - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_cgi = "icc"; m->m_page = PAGE_RESULTS; m->m_obj = OBJ_SI; m->m_flags = PF_API; m++; // m->m_title = "get section voting info in json"; // m->m_desc = "Will cause section voting info to be returned."; // m->m_off = (char *)&si.m_getSectionVotingInfo - y; // m->m_type = TYPE_CHAR; // m->m_def = "0"; // m->m_cgi = "sectionvotes"; // m->m_page = PAGE_RESULTS; // m->m_obj = OBJ_SI; // m->m_flags = PF_API; // m++; ////////////// // END /search ////////////// ////////// // PAGE GET (cached web pages) /////////// m->m_title = "docId"; m->m_desc = "The docid of the cached page to view."; m->m_off = (char *)&gr.m_docId - (char *)&gr; m->m_type = TYPE_LONG_LONG; m->m_page = PAGE_GET; m->m_obj = OBJ_GBREQUEST; // generic request class m->m_def = 
"0"; m->m_cgi = "d"; m->m_flags = PF_API | PF_REQUIRED; m++; m->m_title = "url"; m->m_desc = "Instead of specifying a docid, you can get the " "cached webpage by url as well."; m->m_off = (char *)&gr.m_url - (char *)&gr; m->m_type = TYPE_CHARPTR; // reference into the HttpRequest m->m_page = PAGE_GET; m->m_obj = OBJ_GBREQUEST; // generic request class m->m_def = NULL; m->m_cgi = "url"; m->m_flags = PF_API | PF_REQUIRED; m++; m->m_title = "collection"; m->m_desc = "Get the cached page from this collection."; m->m_cgi = "c"; m->m_page = PAGE_GET; m->m_obj = OBJ_GBREQUEST; m->m_off = (char *)&gr.m_coll - (char *)&gr; m->m_type = TYPE_CHARPTR;//SAFEBUF; m->m_def = NULL; m->m_flags = PF_REQUIRED | PF_API; m++; m->m_title = "strip"; m->m_desc = "Is 1 or 2 two strip various tags from the " "cached content."; m->m_off = (char *)&gr.m_strip - (char *)&gr; m->m_page = PAGE_GET; m->m_obj = OBJ_GBREQUEST; m->m_cgi = "strip"; m->m_def = "0"; m->m_type = TYPE_LONG; m->m_flags = PF_API; m++; m->m_title = "include header"; m->m_desc = "Is 1 to include the Gigablast header at the top of " "the cached page, 0 to exclude the header."; m->m_def = "1"; m->m_type = TYPE_BOOL; m->m_page = PAGE_GET; m->m_obj = OBJ_GBREQUEST; m->m_cgi = "ih"; m->m_off = (char *)&gr.m_includeHeader - (char *)&gr; m->m_flags = PF_API; m++; m->m_title = "query"; m->m_desc = "Highlight this query in the page."; m->m_def = ""; m->m_type = TYPE_CHARPTR; m->m_page = PAGE_GET; m->m_obj = OBJ_GBREQUEST; m->m_cgi = "q"; m->m_off = (char *)&gr.m_query - (char *)&gr; m->m_flags = PF_API; m++; /* // for /get m->m_title = "query highlighting query"; m->m_desc = "Is 1 to highlight query terms in the cached page."; m->m_def = "1"; m->m_type = TYPE_BOOL; m->m_cgi = "qh"; m->m_off = (char *)&si.m_queryHighlighting - y; m++; */ // for /addurl /* m->m_title = "url to add"; m->m_desc = "Used by add url page."; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_cgi = "u"; m->m_off = (char *)&si.m_url2 - y; m++; */ // Process.cpp calls Msg28::massConfig with &haspower=[0|1] to // indicate power loss or coming back on from a power loss m->m_title = "power on status notification"; m->m_desc = "Indicates power is back on."; m->m_cgi = "poweron"; m->m_type = TYPE_CMD; m->m_func = CommandPowerOnNotice; m->m_cast = 0; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; m->m_title = "power off status notification"; m->m_desc = "Indicates power is off."; m->m_cgi = "poweroff"; m->m_type = TYPE_CMD; m->m_func = CommandPowerOffNotice; m->m_cast = 0; m->m_page = PAGE_NONE; m->m_obj = OBJ_CONF; m++; ////////////// // END PAGE_GET ////////////// /////////////////////////////////////////// // MASTER CONTROLS /////////////////////////////////////////// m->m_title = "spidering enabled"; m->m_desc = "Controls all spidering for all collections"; m->m_cgi = "se"; m->m_off = (char *)&g_conf.m_spideringEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "1"; //m->m_cast = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "injections enabled"; m->m_desc = "Controls injecting for all collections"; m->m_cgi = "injen"; m->m_off = (char *)&g_conf.m_injectionsEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "querying enabled"; m->m_desc = "Controls querying for all collections"; m->m_cgi = "qryen"; m->m_off = (char *)&g_conf.m_queryingEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "return results even if a shard is down"; m->m_desc = 
"If you turn this off then Gigablast will return " "an error message if a shard was down and did not return " "results for a query. The XML and JSON feed let's you know " "when a shard is down and will give you the results back " "any way, but if you would rather have just and error message " "and no results, then set then set this to 'NO'."; m->m_cgi = "rra"; m->m_off = (char *)&g_conf.m_returnResultsAnyway - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max mem"; m->m_desc = "Mem available to this process. May be exceeded due " "to fragmentation."; m->m_cgi = "maxmem"; m->m_off = (char *)&g_conf.m_maxMem - g; m->m_def = "8000000000"; m->m_obj = OBJ_CONF; m->m_page = PAGE_MASTER; // PAGE_NONE; m->m_type = TYPE_LONG_LONG; //m->m_flags = PF_NOAPI; m++; m->m_title = "max total spiders"; m->m_desc = "What is the maximum number of web " "pages the spider is allowed to download " "simultaneously for ALL collections PER HOST? Caution: " "raising this too high could result in some Out of Memory " "(OOM) errors. The hard limit is currently 300. Each " "collection has its own limit in the spider controls " "that you may have to increase as well."; m->m_cgi = "mtsp"; m->m_off = (char *)&g_conf.m_maxTotalSpiders - g; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "web spidering enabled"; m->m_desc = "Spiders events on web"; m->m_cgi = "wse"; m->m_off = (char *)&g_conf.m_webSpideringEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ m->m_title = "add url enabled"; m->m_desc = "Can people use the add url interface to add urls " "to the index?"; m->m_cgi = "ae"; m->m_off = (char *)&g_conf.m_addUrlEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "1"; //m->m_cast = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "use collection passwords"; m->m_desc = "Should collections have individual password settings " "so different users can administrer different collections? " "If not the only the master passwords and IPs will be able " "to administer any collection."; m->m_cgi = "ucp"; m->m_off = (char *)&g_conf.m_useCollectionPasswords - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "allow cloud users"; m->m_desc = "Can guest users create and administer " "a collection? Limit: 1 " "collection per IP address. This is mainly for doing " "demos on the gigablast.com domain."; m->m_cgi = "acu"; m->m_off = (char *)&g_conf.m_allowCloudUsers - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "auto save frequency"; m->m_desc = "Save data in memory to disk after this many minutes " "have passed without the data having been dumped or saved " "to disk. Use 0 to disable."; m->m_cgi = "asf"; m->m_off = (char *)&g_conf.m_autoSaveFrequency - g; m->m_type = TYPE_LONG; m->m_def = "5"; m->m_units = "mins"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max http sockets"; m->m_desc = "Maximum sockets available to serve incoming HTTP " "requests. Too many outstanding requests will increase " "query latency. Excess requests will simply have their " "sockets closed."; m->m_cgi = "ms"; m->m_off = (char *)&g_conf.m_httpMaxSockets - g; m->m_type = TYPE_LONG; // up this some, am seeing sockets closed because of using gb // as a cache... 
m->m_def = "300"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max https sockets"; m->m_desc = "Maximum sockets available to serve incoming HTTPS " "requests. Like max http sockets, but for secure sockets."; m->m_cgi = "mss"; m->m_off = (char *)&g_conf.m_httpsMaxSockets - g; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "spider user agent"; m->m_desc = "Identification seen by web servers when " "the Gigablast spider downloads their web pages. " "It is polite to insert a contact email address here so " "webmasters that experience problems from the Gigablast " "spider have somewhere to vent."; m->m_cgi = "sua"; m->m_off = (char *)&g_conf.m_spiderUserAgent - g; m->m_type = TYPE_STRING; m->m_size = USERAGENTMAXSIZE; m->m_def = "GigablastOpenSource/1.0"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "use temporary cluster"; m->m_desc = "Used by proxy to point to a temporary cluster while the " "original cluster is updated with a new binary. The " "temporary cluster is the same as the original cluster but " "the ports are all incremented by one from what is in " "the hosts.conf. This should ONLY be used for the proxy."; m->m_cgi = "aotp"; m->m_off = (char *)&g_conf.m_useTmpCluster - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "url injection enabled"; m->m_desc = "If enabled you can directly inject URLs into the index."; m->m_cgi = "ie"; m->m_off = (char *)&g_conf.m_injectionEnabled - g; m->m_type = TYPE_BOOL; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_def = "1"; m++; */ m->m_title = "init QA tests"; m->m_desc = "If initiated gb performs some integrity tests " "to ensure injecting, spidering and searching works " "properly. Uses ./test/ subdirectory. Injects " "urls in ./test/inject.txt. Spiders urls " "in ./test/spider.txt. " "Each of those two files is essentially a simple format of " "a url followed by the http reply received from the server " "for that url. 
" // TODO: generate these files ; m->m_cgi = "qasptei"; m->m_type = TYPE_CMD; m->m_func = CommandSpiderTestInit; m->m_def = "1"; m->m_cast = 1; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "init parser test run"; m->m_desc = "If enabled gb injects the urls in the " "./test-parser/urls.txt " "file and outputs ./test-parser/qa.html"; m->m_cgi = "qaptei"; m->m_type = TYPE_CMD; m->m_func = CommandParserTestInit; m->m_def = "1"; m->m_cast = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "init spider test run"; m->m_desc = "If enabled gb injects the urls in " "./test-spider/spider.txt " "and spiders links."; m->m_cgi = "qasptei"; m->m_type = TYPE_CMD; m->m_func = CommandSpiderTestInit; m->m_def = "1"; m->m_cast = 1; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "continue spider test run"; m->m_desc = "Resumes the test."; m->m_cgi = "qaspter"; m->m_type = TYPE_CMD; m->m_func = CommandSpiderTestCont; m->m_def = "1"; m->m_cast = 1; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "do docid range splitting"; m->m_desc = "Split msg39 docids into ranges to save mem?"; m->m_cgi = "ddrs"; m->m_off = (char *)&g_conf.m_doDocIdRangeSplitting - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m++; */ m->m_title = "qa search test enabled"; m->m_desc = "If enabled gb does the search queries in " "./test-search/queries.txt and compares to the last run and " "outputs the diffs for inspection and validation."; m->m_cgi = "qasste"; m->m_off = (char *)&g_conf.m_testSearchEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "1"; //m->m_cast = 0; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "just save"; m->m_desc = "Copies the data in memory to disk for just this host. " "Does Not exit."; m->m_cgi = "js"; m->m_type = TYPE_CMD; m->m_func = CommandJustSave; m->m_page = PAGE_MASTER; m->m_cast = 0; m++; */ m->m_title = "save"; m->m_desc = "Saves in-memory data for ALL hosts. 
Does Not exit."; m->m_cgi = "js"; m->m_type = TYPE_CMD; m->m_func = CommandJustSave; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "all spiders on"; m->m_desc = "Enable spidering on all hosts"; m->m_cgi = "ase"; m->m_def = "1"; m->m_off = (char *)&g_conf.m_spideringEnabled - g; m->m_type = TYPE_BOOL2; // no yes or no, just a link m++; m->m_title = "all spiders off"; m->m_desc = "Disable spidering on all hosts"; m->m_cgi = "ase"; m->m_def = "0"; m->m_off = (char *)&g_conf.m_spideringEnabled - g; m->m_type = TYPE_BOOL2; // no yes or no, just a link m++; */ /* m->m_title = "save & exit"; m->m_desc = "Copies the data in memory to disk for just this host " "and then shuts down the gb process."; m->m_cgi = "save"; m->m_type = TYPE_CMD; m->m_func = CommandSaveAndExit; m->m_cast = 0; m++; m->m_title = "urgent save & exit"; m->m_desc = "Copies the data in memory to disk for just this host " "and then shuts down the gb process."; m->m_cgi = "usave"; m->m_type = TYPE_CMD; m->m_func = CommandUrgentSaveAndExit; m->m_cast = 0; m->m_priv = 4; m++; */ m->m_title = "save & exit"; m->m_desc = "Saves the data and exits for ALL hosts."; m->m_cgi = "save"; m->m_type = TYPE_CMD; m->m_func = CommandSaveAndExit; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "rebalance shards"; m->m_desc = "Tell all hosts to scan all records in all databases, " "and move " "records to the shard they belong to. You only need to run " "this if Gigablast tells you to, when you are changing " "hosts.conf to add or remove more nodes/hosts."; m->m_cgi = "rebalance"; m->m_type = TYPE_CMD; m->m_func = CommandRebalance; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dump to disk"; m->m_desc = "Flushes all records in memory to the disk on all hosts."; m->m_cgi = "dump"; m->m_type = TYPE_CMD; m->m_func = CommandDiskDump; m->m_cast = 1; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "force reclaim"; m->m_desc = "Force reclaim of doledb mem."; m->m_cgi = "forceit"; m->m_type = TYPE_CMD; m->m_func = CommandForceIt; m->m_cast = 1; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; m->m_title = "tight merge posdb"; m->m_desc = "Merges all outstanding posdb (index) files."; m->m_cgi = "pmerge"; m->m_type = TYPE_CMD; m->m_func = CommandMergePosdb; m->m_cast = 1; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; //m->m_title = "tight merge sectiondb"; //m->m_desc = "Merges all outstanding sectiondb files."; //m->m_cgi = "smerge"; //m->m_type = TYPE_CMD; //m->m_func = CommandMergeSectiondb; //m->m_cast = 1; //m++; m->m_title = "tight merge titledb"; m->m_desc = "Merges all outstanding titledb (web page cache) files."; m->m_cgi = "tmerge"; m->m_type = TYPE_CMD; m->m_func = CommandMergeTitledb; m->m_cast = 1; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "tight merge spiderdb"; m->m_desc = "Merges all outstanding spiderdb files."; m->m_cgi = "spmerge"; m->m_type = TYPE_CMD; m->m_func = CommandMergeSpiderdb; m->m_cast = 1; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "clear kernel error message"; m->m_desc = "Clears the kernel error message. 
You must do this " "to stop getting email alerts for a kernel ring buffer " "error alert."; m->m_cgi = "clrkrnerr"; m->m_type = TYPE_CMD; m->m_func = CommandClearKernelError; m->m_cast = 1; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "disk page cache off"; m->m_desc = "Disable all disk page caches to save mem for " "tmp cluster. Run " "gb cacheoff to do this for all hosts."; m->m_cgi = "dpco"; m->m_type = TYPE_CMD; m->m_func = CommandDiskPageCacheOff; m->m_cast = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; //m->m_title = "http server enabled"; //m->m_desc = "Disable this if you do not want anyone hitting your " // "http server. Admin and local IPs are still permitted, " // "however."; //m->m_cgi = "hse"; //m->m_off = (char *)&g_conf.m_httpServerEnabled - g; //m->m_type = TYPE_BOOL; //m->m_def = "1"; //m++; /* m->m_title = "ad feed enabled"; m->m_desc = "Serves ads unless pure=1 is in cgi parms."; m->m_cgi = "afe"; m->m_off = (char *)&g_conf.m_adFeedEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_scgi = "ads"; m->m_soff = (char *)&si.m_adFeedEnabled - y; m->m_sparm = 1; m->m_priv = 2; m++; */ m->m_title = "do stripe balancing"; m->m_desc = "Stripe #n contains twin #n from each group. Doing " "stripe balancing helps prevent too many query requests " "coming into one host. This parm is only for the proxy. " "Stripe balancing is done by default unless the parm is " "disabled on the proxy in which case it appends a " "&dsb=0 to the query url it sends to the host. The proxy " "alternates to which host it forwards the incoming query " "based on the stripe. It takes the number of query terms in " "the query into account to make a more even balance."; m->m_cgi = "dsb"; m->m_off = (char *)&g_conf.m_doStripeBalancing - g; m->m_type = TYPE_BOOL; m->m_def = "1"; //m->m_scgi = "dsb"; //m->m_soff = (char *)&si.m_doStripeBalancing - y; //m->m_sparm = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "is live cluster"; m->m_desc = "Is this cluster part of a live production cluster? " "If this is true we make sure that elvtune is being " "set properly for best performance, otherwise, gb will " "not start up."; m->m_cgi = "live"; m->m_off = (char *)&g_conf.m_isLive - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "is BuzzLogic"; m->m_desc = "Is this a BuzzLogic cluster?"; m->m_cgi = "isbuzz"; m->m_off = (char *)&g_conf.m_isBuzzLogic - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ // we use wikipedia cluster for quick categorization m->m_title = "is wikipedia cluster"; m->m_desc = "Is this cluster just used for indexing wikipedia pages?"; m->m_cgi = "iswiki"; m->m_off = (char *)&g_conf.m_isWikipedia - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "ask for gzipped docs when downloading"; m->m_desc = "If this is true, gb will send Accept-Encoding: gzip " "to web servers when doing http downloads. It does have " "a tendency to cause out-of-memory errors when you enable " "this, so until that is fixed better, it's probably a good " "idea to leave this disabled."; m->m_cgi = "afgdwd"; m->m_off = (char *)&g_conf.m_gzipDownloads - g; m->m_type = TYPE_BOOL; // keep this default off because it seems some pages are huge // uncompressed causing OOM errors and possibly corrupting stuff? 
// not sure exactly, but i don't like going OOM. so maybe until // that is fixed leave this off. m->m_def = "0"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "search results cache max age"; m->m_desc = "How many seconds should we cache a search results " "page for?"; m->m_cgi = "srcma"; m->m_off = (char *)&g_conf.m_searchResultsMaxCacheAge - g; m->m_def = "10800"; // 3 hrs m->m_type = TYPE_LONG; m->m_units = "seconds"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "autoban IPs which violate the queries per day quotas"; m->m_desc = "Keep track of ips which do queries, and disallow " "non-customers from hitting us too hard."; m->m_cgi = "ab"; m->m_off = (char *)&g_conf.m_doAutoBan - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; if ( g_isYippy ) { m->m_title = "Max outstanding search requests out for yippy"; m->m_desc = "Max outstanding search requests out for yippy"; m->m_cgi = "ymo"; m->m_off = (char *)&g_conf.m_maxYippyOut - g; m->m_type = TYPE_LONG; m->m_def = "150"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; } m->m_title = "free queries per day "; m->m_desc = "Non-customers get this many queries per day before " "being autobanned"; m->m_cgi = "nfqpd"; m->m_off = (char *)&g_conf.m_numFreeQueriesPerDay - g; m->m_type = TYPE_LONG; m->m_def = "1024"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "free queries per minute "; m->m_desc = "Non-customers get this many queries per minute before " "being autobanned"; m->m_cgi = "nfqpm"; m->m_off = (char *)&g_conf.m_numFreeQueriesPerMinute - g; m->m_type = TYPE_CHAR; m->m_def = "30"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max heartbeat delay in milliseconds"; m->m_desc = "If a heartbeat is delayed this many milliseconds " "dump a core so we can see where the CPU was. " "Logs 'db: missed heartbeat by %"INT64" ms'. " "Use 0 or less to disable."; m->m_cgi = "mhdms"; m->m_off = (char *)&g_conf.m_maxHeartbeatDelay - g; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_flags = PF_CLONE; // PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max delay before logging a callback or handler"; m->m_desc = "If a call to a message callback or message handler " "in the udp server takes more than this many milliseconds, " "then log it. " "Logs 'udp: Took %"INT64" ms to call callback for msgType=" "0x%hhx niceness=%"INT32"'. 
" "Use -1 or less to disable the logging."; m->m_cgi = "mdch"; m->m_off = (char *)&g_conf.m_maxCallbackDelay - g; m->m_type = TYPE_LONG; m->m_def = "-1"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "sendmail IP"; m->m_desc = "We send crawlbot notification emails to this sendmail " "server which forwards them to the specified email address."; m->m_cgi = "smip"; m->m_off = (char *)&g_conf.m_sendmailIp - g; m->m_type = TYPE_STRING; m->m_def = "10.5.54.47"; m->m_size = MAX_MX_LEN; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send email alerts"; m->m_desc = "Sends emails to admin if a host goes down."; m->m_cgi = "sea"; m->m_off = (char *)&g_conf.m_sendEmailAlerts - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "delay non critical email alerts"; m->m_desc = "Do not send email alerts about dead hosts to " "anyone except sysadmin@gigablast.com between the times " "given below unless all the twins of the dead host are " "also dead. Instead, wait till after if the host " "is still dead. "; m->m_cgi = "dnca"; m->m_off = (char *)&g_conf.m_delayNonCriticalEmailAlerts - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; //m->m_title = "send email alerts to matt at tmobile 450-3518"; //m->m_desc = "Sends to cellphone."; //m->m_cgi = "seatmt"; //m->m_off = (char *)&g_conf.m_sendEmailAlertsToMattTmobile - g; //m->m_type = TYPE_BOOL; //m->m_def = "1"; //m->m_priv = 2; //m->m_group = 0; //m++; //m->m_title = "send email alerts to matt at alltel 362-6809"; /* m->m_title = "send email alerts to matt at alltel 450-3518"; m->m_desc = "Sends to cellphone."; m->m_cgi = "seatmv"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToMattAlltell - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_priv = 2; m->m_group = 0; m++; m->m_title = "send email alerts to javier"; m->m_desc = "Sends to cellphone."; m->m_cgi = "seatj"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToJavier - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m++; */ // m->m_title = "send email alerts to melissa"; // m->m_desc = "Sends to cell phone."; // m->m_cgi = "seatme"; // m->m_off = (char *)&g_conf.m_sendEmailAlertsToMelissa - g; // m->m_type = TYPE_BOOL; // m->m_def = "0"; // m->m_priv = 2; // m->m_group = 0; // m++; /* m->m_title = "send email alerts to partap"; m->m_desc = "Sends to cell phone."; m->m_cgi = "seatp"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToPartap - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m++; */ // m->m_title = "send email alerts to cinco"; // m->m_desc = "Sends to cell phone."; // m->m_cgi = "seatc"; // m->m_off = (char *)&g_conf.m_sendEmailAlertsToCinco - g; // m->m_type = TYPE_BOOL; // m->m_def = "0"; // m->m_priv = 2; // m->m_group = 0; // m++; /* m->m_title = "maximum hops from parent page"; m->m_desc = "Only index pages that are within a particular number " "of hops from the parent page given in Page Add Url. 
-1 means " "that max hops is infinite."; m->m_cgi = "mnh"; m->m_off = (char *)&cr.m_maxNumHops - x; m->m_type = TYPE_CHAR2; m->m_def = "-1"; m->m_group = 0; m++;*/ m->m_title = "cluster name"; m->m_desc = "Email alerts will include the cluster name"; m->m_cgi = "cn"; m->m_off = (char *)&g_conf.m_clusterName - g; m->m_type = TYPE_STRING; m->m_size = 32; m->m_def = "unspecified"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "spider round start time"; m->m_desc = "When the next spider round starts. If you force this to " "zero it sets it to the current time. That way you can " "respider all the urls that were already spidered, and urls " "that were not yet spidered in the round will still be " "spidered."; m->m_cgi = "spiderRoundStart"; m->m_size = 0; m->m_off = (char *)&cr.m_spiderRoundStartTime - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ; m++; // DIFFBOT: // this http parm actually adds the "forceround" parm to the parmlist // below with the appropriate args. m->m_title = "manually restart a spider round"; m->m_desc = "Updates round number and resets local processed " "and crawled counts to 0."; m->m_cgi = "roundStart"; m->m_type = TYPE_CMD; m->m_func = NULL; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_HIDDEN; m++; // DIFFBOT: // . this is sent to each shard by issuing a "&roundStart=1" cmd // . similar to the "addcoll" cmd we add args to it and make it // the "forceround" cmd parm and add THAT to the parmlist. // so "roundStart=1" is really an alias for us. m->m_title = "manually restart a spider round on shard"; m->m_desc = "Updates round number and resets local processed " "and crawled counts to 0."; m->m_cgi = "forceround"; //m->m_off = (char *)&cr.m_spiderRoundStartTime - x; m->m_type = TYPE_CMD; m->m_func = CommandForceNextSpiderRound; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS ; m++; m->m_title = "spider round num"; m->m_desc = "The spider round number."; m->m_cgi = "spiderRoundNum"; m->m_off = (char *)&cr.m_spiderRoundNum - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_HIDDEN ; m++; m->m_title = "send email alerts to sysadmin"; m->m_desc = "Sends to sysadmin@gigablast.com."; m->m_cgi = "seatsa"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToSysadmin - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "send email alerts to zak"; m->m_desc = "Sends to zak@gigablast.com."; m->m_cgi = "seatz"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToZak - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m++; m->m_title = "send email alerts to sabino"; m->m_desc = "Sends to cell phone."; m->m_cgi = "seatms"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToSabino - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m++; */
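// Note: every entry must end with m++; or the next block of assignments
// silently overwrites it in the same Parm slot. A minimal sketch of the
// failure mode (hypothetical titles, same pattern as above):
//   Parm *m = m_parms;
//   m->m_title = "a"; m++;   // entry 0 holds "a"
//   m->m_title = "b";        // m++ forgotten here...
//   m->m_title = "c"; m++;   // ...so "c" clobbers "b" in the same slot
m->m_title = "dead host timeout"; m->m_desc = "Consider a host in the Gigablast network to be dead if " "it does not respond to successive pings for this number of " "milliseconds. Gigablast does not send requests to dead hosts. 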
" "Outstanding requests may be re-routed to a twin."; m->m_cgi = "dht"; m->m_off = (char *)&g_conf.m_deadHostTimeout - g; m->m_type = TYPE_LONG; m->m_def = "4000"; m->m_units = "milliseconds"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send email timeout"; m->m_desc = "Send an email after a host has not responded to " "successive pings for this many milliseconds."; m->m_cgi = "set"; m->m_off = (char *)&g_conf.m_sendEmailTimeout - g; m->m_type = TYPE_LONG; m->m_def = "62000"; m->m_priv = 2; m->m_units = "milliseconds"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "ping spacer"; m->m_desc = "Wait this many milliseconds before pinging the next " "host. Each host pings all other hosts in the network."; m->m_cgi = "ps"; m->m_off = (char *)&g_conf.m_pingSpacer - g; m->m_min = 50; // i've seen values of 0 hammer the cpu m->m_type = TYPE_LONG; m->m_def = "100"; m->m_units = "milliseconds"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; //m->m_title = "max query time"; //m->m_desc = "When computing the average query latency " // "truncate query latency times to this so that " // "a single insanely int32_t query latency time does " // "not trigger the alarm. This is in seconds."; //m->m_cgi = "mqlr"; //m->m_off = (char *)&g_conf.m_maxQueryTime - g; //m->m_type = TYPE_FLOAT; //m->m_def = "30.0"; //m->m_priv = 2; //m->m_group = 0; //m++; m->m_title = "query success rate threshold"; m->m_desc = "Send email alerts when query success rate goes below " "this threshold. (percent rate between 0.0 and 1.0)"; m->m_cgi = "qsrt"; m->m_off = (char *)&g_conf.m_querySuccessThreshold - g; m->m_type = TYPE_FLOAT; m->m_def = "0.850000"; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "average query latency threshold"; m->m_desc = "Send email alerts when average query latency goes above " "this threshold. (in seconds)"; m->m_cgi = "aqpst"; m->m_off = (char *)&g_conf.m_avgQueryTimeThreshold - g; m->m_type = TYPE_FLOAT; // a titlerec fetch times out after 2 seconds and is re-routed m->m_def = "2.000000"; m->m_priv = 2; m->m_units = "seconds"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "number of query times in average"; m->m_desc = "Record this number of query times before calculating " "average query latency."; m->m_cgi = "nqt"; m->m_off = (char *)&g_conf.m_numQueryTimes - g; m->m_type = TYPE_LONG; m->m_def = "300"; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max corrupt index lists"; m->m_desc = "If we reach this many corrupt index lists, send " "an admin email. Set to -1 to disable."; m->m_cgi = "mcil"; m->m_off = (char *)&g_conf.m_maxCorruptLists - g; m->m_type = TYPE_LONG; m->m_def = "5"; m->m_priv = 2; m->m_group = 0; m->m_flags = PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max hard drive temperature"; m->m_desc = "At what temperature in Celsius should we send " "an email alert if a hard drive reaches it?"; m->m_cgi = "mhdt"; m->m_off = (char *)&g_conf.m_maxHardDriveTemp - g; m->m_type = TYPE_LONG; m->m_def = "45"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "delay emails after"; m->m_desc = "If delay non critical email alerts is on, don't send " "emails after this time. Time is hh:mm. 
Time is taken from " "host #0's system clock in UTC."; m->m_cgi = "dea"; m->m_off = (char *)&g_conf.m_delayEmailsAfter - g; m->m_type = TYPE_TIME; // time format -- very special m->m_def = "00:00"; m->m_priv = 2; m++; m->m_title = "delay emails before"; m->m_desc = "If delay non critical email alerts is on, don't send " "emails before this time. Time is hh:mm. Time is taken from " "host #0's system clock in UTC."; m->m_cgi = "deb"; m->m_off = (char *)&g_conf.m_delayEmailsBefore - g; m->m_type = TYPE_TIME; // time format -- very special m->m_def = "00:00"; m->m_priv = 2; m++; */ /* Disable this until it works. m->m_title = "use merge token"; m->m_desc = "If used, prevents twins, or hosts on the same ide " "channel, from merging simultaneously."; m->m_cgi = "umt"; m->m_off = (char *)&g_conf.m_useMergeToken - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m++; */ m->m_title = "error string 1"; m->m_desc = "Look for this string in the kernel buffer; if found, send " "an email alert. Useful for detecting some strange " "hard drive failures that really slow performance."; m->m_cgi = "errstrone"; m->m_off = (char *)&g_conf.m_errstr1 - g; m->m_type = TYPE_STRING; m->m_def = "I/O error"; m->m_size = MAX_URL_LEN; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "error string 2"; m->m_desc = "Look for this string in the kernel buffer; if found, send " "an email alert. Useful for detecting some strange " "hard drive failures that really slow performance."; m->m_cgi = "errstrtwo"; m->m_off = (char *)&g_conf.m_errstr2 - g; m->m_type = TYPE_STRING; m->m_def = ""; m->m_size = MAX_URL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "error string 3"; m->m_desc = "Look for this string in the kernel buffer; if found, send " "an email alert. Useful for detecting some strange " "hard drive failures that really slow performance."; m->m_cgi = "errstrthree"; m->m_off = (char *)&g_conf.m_errstr3 - g; m->m_type = TYPE_STRING; m->m_def = ""; m->m_size = MAX_URL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send email alerts to email 1"; m->m_desc = "Sends to email address 1 through email server 1."; m->m_cgi = "seatone"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail1 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send parm change email alerts to email 1"; m->m_desc = "Sends to email address 1 through email server 1 if " "any parm is changed."; m->m_cgi = "seatonep"; m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail1 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email server 1"; m->m_desc = "Connects to this IP or hostname " "directly when sending email 1. " "Use apt-get install sendmail to install sendmail " "on that IP or hostname. Add From:10.5 RELAY to " "/etc/mail/access to allow sendmail to forward email it " "receives from gigablast if gigablast hosts are on the " "10.5.*.* IPs. 
Then run /etc/init.d/sendmail restart " "as root to pick up those changes so sendmail will forward " "Gigablast's email to the email address you give below."; m->m_cgi = "esrvone"; m->m_off = (char *)&g_conf.m_email1MX - g; m->m_type = TYPE_STRING; m->m_def = "127.0.0.1"; m->m_size = MAX_MX_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email address 1"; m->m_desc = "Sends to this address when sending email 1 "; m->m_cgi = "eaddrone"; m->m_off = (char *)&g_conf.m_email1Addr - g; m->m_type = TYPE_STRING; m->m_def = "4081234567@vtext.com"; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "from email address 1"; m->m_desc = "The from field when sending email 1 "; m->m_cgi = "efaddrone"; m->m_off = (char *)&g_conf.m_email1From - g; m->m_type = TYPE_STRING; m->m_def = "sysadmin@mydomain.com"; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send email alerts to email 2"; m->m_desc = "Sends to email address 2 through email server 2."; m->m_cgi = "seattwo"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail2 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send parm change email alerts to email 2"; m->m_desc = "Sends to email address 2 through email server 2 if " "any parm is changed."; m->m_cgi = "seattwop"; m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail2 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email server 2"; m->m_desc = "Connects to this server directly when sending email 2 "; m->m_cgi = "esrvtwo"; m->m_off = (char *)&g_conf.m_email2MX - g; m->m_type = TYPE_STRING; m->m_def = "mail.mydomain.com"; m->m_size = MAX_MX_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email address 2"; m->m_desc = "Sends to this address when sending email 2 "; m->m_cgi = "eaddrtwo"; m->m_off = (char *)&g_conf.m_email2Addr - g; m->m_type = TYPE_STRING; m->m_def = ""; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "from email address 2"; m->m_desc = "The from field when sending email 2 "; m->m_cgi = "efaddrtwo"; m->m_off = (char *)&g_conf.m_email2From - g; m->m_type = TYPE_STRING; m->m_def = "sysadmin@mydomain.com"; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send email alerts to email 3"; m->m_desc = "Sends to email address 3 through email server 3."; m->m_cgi = "seatthree"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail3 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send parm change email alerts to email 3"; m->m_desc = "Sends to email address 3 through email server 3 if " "any parm is changed."; m->m_cgi = "seatthreep"; m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail3 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email server 3"; m->m_desc = "Connects to this server directly when sending email 3 "; m->m_cgi = "esrvthree"; m->m_off = (char *)&g_conf.m_email3MX - g; m->m_type = TYPE_STRING; m->m_def = "mail.mydomain.com"; m->m_size = MAX_MX_LEN; 
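// Aside: every email parm in this block relies on the same offset trick
// used throughout this table: m_off records the byte distance from the
// base of g_conf (the "g" pointer) to the target member, so one generic
// setter can service any TYPE_STRING conf parm without per-parm code.
// A minimal sketch of such a setter follows; setConfString() is a
// hypothetical helper for illustration only and is not called anywhere:
/*
static void setConfString ( Parm *m , const char *val ) {
	// base of the conf object + recorded byte offset = target buffer
	char *dst = (char *)&g_conf + m->m_off;
	// m_size is the full buffer size, including the NUL terminator
	int32_t n = m->m_size - 1;
	strncpy ( dst , val , n );
	dst[n] = '\0';
}
*/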
m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email address 3"; m->m_desc = "Sends to this address when sending email 3 "; m->m_cgi = "eaddrthree"; m->m_off = (char *)&g_conf.m_email3Addr - g; m->m_type = TYPE_STRING; m->m_def = ""; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "from email address 3"; m->m_desc = "The from field when sending email 3 "; m->m_cgi = "efaddrthree"; m->m_off = (char *)&g_conf.m_email3From - g; m->m_type = TYPE_STRING; m->m_def = "sysadmin@mydomain.com"; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send email alerts to email 4"; m->m_desc = "Sends to email address 4 through email server 4."; m->m_cgi = "seatfour"; m->m_off = (char *)&g_conf.m_sendEmailAlertsToEmail4 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send parm change email alerts to email 4"; m->m_desc = "Sends to email address 4 through email server 4 if " "any parm is changed."; m->m_cgi = "seatfourp"; m->m_off = (char *)&g_conf.m_sendParmChangeAlertsToEmail4 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email server 4"; m->m_desc = "Connects to this server directly when sending email 4 "; m->m_cgi = "esrvfour"; m->m_off = (char *)&g_conf.m_email4MX - g; m->m_type = TYPE_STRING; m->m_def = "mail.mydomain.com"; m->m_size = MAX_MX_LEN; m->m_priv = 2; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "email address 4"; m->m_desc = "Sends to this address when sending email 4 "; m->m_cgi = "eaddrfour"; m->m_off = (char *)&g_conf.m_email4Addr - g; m->m_type = TYPE_STRING; m->m_def = ""; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "from email address 4"; m->m_desc = "The from field when sending email 4 "; m->m_cgi = "efaddrfour"; m->m_off = (char *)&g_conf.m_email4From - g; m->m_type = TYPE_STRING; m->m_def = "sysadmin@mydomain.com"; m->m_size = MAX_EMAIL_LEN; m->m_priv = 2; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "prefer local reads"; m->m_desc = "If you have scsi drives or a slow network, say yes here " "to minimize data fetches across the network."; m->m_cgi = "plr"; m->m_off = (char *)&g_conf.m_preferLocalReads - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "use biased tfndb"; m->m_desc = "Should we always send titledb record lookup requests " "to a particular host in order to increase tfndb page cache " "hits? This bypasses load balancing and may result in " "slower hosts being more of a bottleneck. 
Keep this disabled " "unless you notice tfndb disk seeks slowing things down."; m->m_cgi = "ubu"; m->m_off = (char *)&g_conf.m_useBiasedTfndb - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m++; */ // this is ifdef'd out in Msg3.cpp for performance reasons, // so do it here, too #ifdef GBSANITYCHECK m->m_title = "max corrupted read retries"; m->m_desc = "How many times to retry disk reads that had corrupted " "data before requesting the list from a twin, and, if that " "fails, removing the bad data."; m->m_cgi = "crr"; m->m_off = (char *)&g_conf.m_corruptRetries - g; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; #endif m->m_title = "do incremental updating"; m->m_desc = "When reindexing a document, do not re-add data " "that should already be in the index or clusterdb " "since the last time the document was indexed. Otherwise, " "re-add the data regardless."; m->m_cgi = "oic"; //m->m_off = (char *)&g_conf.m_onlyAddUnchangedTermIds - g; m->m_off = (char *)&g_conf.m_doIncrementalUpdating - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; // you can really screw up the index if this is false, so // comment it out for now /* m->m_title = "index deletes"; m->m_desc = "Should we allow indexdb recs to be deleted? This is " "always true, except in very rare indexdb rebuilds."; m->m_cgi = "id"; m->m_off = (char *)&g_conf.m_indexDeletes - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m++; */ m->m_title = "use etc hosts"; m->m_desc = "Use the /etc/hosts file to resolve hostnames? The " "/etc/hosts file is reloaded every minute, so if you make " "a change to it you might have to wait one minute for the " "change to take effect."; m->m_cgi = "ueh"; m->m_off = (char *)&g_conf.m_useEtcHosts - g; m->m_def = "0"; m->m_type = TYPE_BOOL; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "twins are split"; m->m_desc = "If enabled, Gigablast assumes the first half of " "machines in hosts.conf " "are on a different network switch than the second half, " "and minimizes transmits between the switches."; m->m_cgi = "stw"; m->m_off = (char *)&g_conf.m_splitTwins - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "do out of memory testing"; m->m_desc = "When enabled Gigablast will randomly fail at " "allocating memory. Used for testing stability."; m->m_cgi = "dot"; m->m_off = (char *)&g_conf.m_testMem - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "do consistency testing"; m->m_desc = "When enabled Gigablast will make sure it reparses " "the document exactly the same way. It does this every " "1000th document anyway, but enabling this makes it do it " "for every document."; m->m_cgi = "dct"; m->m_off = (char *)&g_conf.m_doConsistencyTesting - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "use shotgun"; m->m_desc = "If enabled, all servers must have two gigabit " "ethernet ports hooked up and Gigablast will round robin " "packets between both ethernet ports when sending to another " "host. 
Can speed up network transmissions as much as 2x."; m->m_cgi = "usht"; m->m_off = (char *)&g_conf.m_useShotgun - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "use quickpoll"; m->m_desc = "If enabled, Gigablast will use quickpoll. Significantly " "improves performance. Only turn this off for testing."; m->m_cgi = "uqp"; m->m_off = (char *)&g_conf.m_useQuickpoll - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; // m->m_title = "quickpoll core on error"; // m->m_desc = "If enabled, quickpoll will terminate the process and " // "generate a core file when callbacks are called with the " // "wrong niceness."; // m->m_cgi = "qpoe"; // m->m_off = (char *)&g_conf.m_quickpollCoreOnError - g; // m->m_type = TYPE_BOOL; // m->m_def = "1"; // m++; // . this will leak the shared mem if the process is Ctrl+C'd // . that is expected behavior // . you can clean up the leaks using 'gb freecache 20000000' // and use 'ipcs -m' to see what leaks you got // . generally, only the main gb should use shared mem, so // keep this off for testing m->m_title = "use shared mem"; m->m_desc = "If enabled, Gigablast will use shared memory. " "Should really only be used on the live cluster; " "keep this off on the testing cluster since it can " "leak easily."; m->m_cgi = "ushm"; m->m_off = (char *)&g_conf.m_useSHM - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; // disable disk caches... for testing really /* m->m_title = "use disk page cache for indexdb"; m->m_desc = "Use disk page cache?"; m->m_cgi = "udpci"; m->m_off = (char *)&g_conf.m_useDiskPageCacheIndexdb - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m++; */ m->m_title = "posdb disk cache size"; m->m_desc = "How much file cache to use, in bytes? Posdb is " "the index."; m->m_cgi = "dpcsp"; m->m_off = (char *)&g_conf.m_posdbFileCacheSize - g; m->m_type = TYPE_LONG_LONG; m->m_def = "30000000"; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "tagdb disk cache size"; m->m_desc = "How much file cache to use, in bytes? Tagdb is " "consulted at spider time and query time to determine " "if a url or outlink is banned or what its siterank is, etc."; m->m_cgi = "dpcst"; m->m_off = (char *)&g_conf.m_tagdbFileCacheSize - g; m->m_type = TYPE_LONG_LONG; m->m_def = "30000000"; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "clusterdb disk cache size"; m->m_desc = "How much file cache to use, in bytes? " "Gigablast does a " "lookup in clusterdb for each search result at query time to " "get its site information for site clustering. If you " "disable site clustering in the search controls then " "clusterdb will not be consulted."; m->m_cgi = "dpcsc"; m->m_off = (char *)&g_conf.m_clusterdbFileCacheSize - g; m->m_type = TYPE_LONG_LONG; m->m_def = "30000000"; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "titledb disk cache size"; m->m_desc = "How much file cache to use, in bytes? Titledb " "holds the cached web pages, compressed. 
Gigablast consults " "it to generate a summary for a search result, or to see if " "a url Gigablast is spidering is already in the index."; m->m_cgi = "dpcsx"; m->m_off = (char *)&g_conf.m_titledbFileCacheSize - g; m->m_type = TYPE_LONG_LONG; m->m_def = "30000000"; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "spiderdb disk cache size"; m->m_desc = "How much file cache to use, in bytes? Spiderdb " "holds the urls scheduled for spidering, so the spider " "consults it constantly when selecting urls to download."; m->m_cgi = "dpcsy"; m->m_off = (char *)&g_conf.m_spiderdbFileCacheSize - g; m->m_type = TYPE_LONG_LONG; m->m_def = "30000000"; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; /* m->m_title = "exclude link text"; m->m_desc = "Exclude search results that have one or more query terms " "that only appear in the incoming link text"; m->m_cgi = "exlt"; m->m_off = (char *)&g_conf.m_excludeLinkText - g; m->m_sparm = 1; m->m_soff = (char *)&si.m_excludeLinkText - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_scgi = "excludelinktext"; m++; m->m_title = "exclude meta text"; m->m_desc = "Exclude search results that have one or more query terms " "that only appear in the meta text"; m->m_cgi = "exmt"; m->m_off = (char *)&g_conf.m_excludeMetaText - g; m->m_sparm = 1; m->m_soff = (char *)&si.m_excludeMetaText - y; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_scgi = "excludemetatext"; m++; */ m->m_title = "scan all if not found"; m->m_desc = "Scan all titledb files if rec not found. You should " "keep this on to avoid corruption. Do not turn it off unless " "you are Matt Wells."; m->m_cgi = "sainf"; m->m_off = (char *)&g_conf.m_scanAllIfNotFound - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "interface machine"; m->m_desc = "For specifying if this is an interface machine. " "Messages are rerouted from this machine to the main " "cluster set in the hosts.conf."; m->m_cgi = "intmch"; m->m_off = (char *)&g_conf.m_interfaceMachine - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "generate vector at query time"; m->m_desc = "At query time, should Gigablast generate content " "vectors for title records lacking them? This is an " "expensive operation, so it is really just for testing purposes."; m->m_cgi = "gv"; m->m_off = (char *)&g_conf.m_generateVectorAtQueryTime - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "redirect non-raw traffic"; m->m_desc = "If this is non-empty, http traffic will be redirected " "to the specified address."; m->m_cgi = "redir"; m->m_off = (char *)&g_conf.m_redirect - g; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_def = ""; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "send requests to compression proxy"; m->m_desc = "If this is true, gb will route download requests for" " web pages to proxies in hosts.conf. Proxies will" " download and compress docs before sending back. 
"; m->m_cgi = "srtcp"; m->m_off = (char *)&g_conf.m_useCompressionProxy - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "synchronize proxy to cluster time"; m->m_desc = "Enable/disable the ability to synchronize time between " "the cluster and the proxy"; m->m_cgi = "sptct"; m->m_off = (char *)&g_conf.m_timeSyncProxy - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "use data feed account server"; m->m_desc = "Enable/disable the use of a remote account verification " "for Data Feed Customers. This should ONLY be used for the " "proxy."; m->m_cgi = "pdfuas"; m->m_off = (char *)&g_conf.m_useDFAcctServer - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "data feed server ip"; m->m_desc = "The ip address of the Gigablast data feed server to " "retrieve customer account information from. This should ONLY " "be used for the proxy."; m->m_cgi = "pdfip"; m->m_off = (char *)&g_conf.m_dfAcctIp - g; m->m_type = TYPE_IP; m->m_def = "2130706433"; m->m_group = 0; m++; m->m_title = "data feed server port"; m->m_desc = "The port of the Gigablast data feed server to retrieve " "customer account information from. This should ONLY be used " "for the proxy"; m->m_cgi = "pdfport"; m->m_off = (char *)&g_conf.m_dfAcctPort - g; m->m_type = TYPE_LONG; m->m_def = "8040"; m->m_group = 0; m++; m->m_title = "data feed server collection"; m->m_desc = "The collection on the Gigablast data feed server to " "retrieve customer account information from. This should ONLY " "be used for the proxy."; m->m_cgi = "pdfcoll"; m->m_off = (char *)&g_conf.m_dfAcctColl - g; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN; m->m_def = "customers"; m->m_group = 0; m++; */ m->m_title = "allow scaling of hosts"; m->m_desc = "Allows scaling up of hosts by deleting recs not in " "the correct group. This should only happen why copying " "a set of servers to the new hosts. Otherwise corrupted " "data will cause a halt."; m->m_cgi = "asoh"; m->m_off = (char *)&g_conf.m_allowScale - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "allow bypass of db validation"; m->m_desc = "Allows bypass of db validation so gigablast will not " "halt if a corrupt db is discovered during load. Use this " "when attempting to load with a collection that has known " "corruption."; m->m_cgi = "abov"; m->m_off = (char *)&g_conf.m_bypassValidation - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "reload language pages"; m->m_desc = "Reloads language specific pages."; m->m_cgi = "rlpages"; m->m_type = TYPE_CMD; m->m_func = CommandReloadLanguagePages; m->m_cast = 0; m++; m->m_title = "proxy port"; m->m_desc = "Retrieve pages from the proxy on " "this port."; m->m_cgi = "proxyport"; m->m_off = (char *)&cr.m_proxyPort - x; m->m_type = TYPE_LONG; m->m_def = "0"; m++; m->m_title = "all reload language pages"; m->m_desc = "Reloads language specific pages for all hosts."; m->m_cgi = "rlpages"; m->m_type = TYPE_CMD; m++; */ // do we need this any more? /* m->m_title = "give up on dead hosts"; m->m_desc = "Give up requests to dead hosts. Only set this when you " "know a host is dead and will not come back online without " "a restarting all hosts. 
Messages will timeout on the dead " "host but will not error, allowing outstanding spidering to " "finish to the twin."; m->m_cgi = "gvup"; m->m_off = (char *)&g_conf.m_giveupOnDeadHosts - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ /* m->m_title = "ask root name servers"; m->m_desc = "if enabled Gigablast will direct DNS requests to " "the root DNS servers, otherwise it will continue to " "send DNS queries to the bind9 servers defined in " "the Master Controls."; m->m_cgi = "bdns"; m->m_off = (char *)&g_conf.m_askRootNameservers - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m++; */ /* m->m_title = "do dig sanity checks"; m->m_desc = "call dig @nameServer hostname and on timedout lookups" " and see if dig also timed out"; m->m_cgi = "dig"; m->m_off = (char *)&g_conf.m_useDig - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ /* m->m_title = "dns root name server 1"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsa"; m->m_off = (char *)&g_conf.m_rnsIps[0] - g; m->m_type = TYPE_IP; m->m_def = "192.228.79.201"; m++; m->m_title = "dns root name server 2"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsb"; m->m_off = (char *)&g_conf.m_rnsIps[1] - g; m->m_type = TYPE_IP; m->m_def = "192.33.4.12"; m++; m->m_title = "dns root name server 3"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsc"; m->m_off = (char *)&g_conf.m_rnsIps[2] - g; m->m_type = TYPE_IP; m->m_def = "128.8.10.90"; m++; m->m_title = "dns root name server 4"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsd"; m->m_off = (char *)&g_conf.m_rnsIps[3] - g; m->m_type = TYPE_IP; m->m_def = "192.203.230.10"; m++; m->m_title = "dns root name server 5"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnse"; m->m_off = (char *)&g_conf.m_rnsIps[4] - g; m->m_type = TYPE_IP; m->m_def = "192.5.5.241"; m++; m->m_title = "dns root name server 6"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsf"; m->m_off = (char *)&g_conf.m_rnsIps[5] - g; m->m_type = TYPE_IP; m->m_def = "192.112.36.4"; m++; m->m_title = "dns root name server 7"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsg"; m->m_off = (char *)&g_conf.m_rnsIps[6] - g; m->m_type = TYPE_IP; m->m_def = "128.63.2.53"; m++; m->m_title = "dns root name server 8"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsh"; m->m_off = (char *)&g_conf.m_rnsIps[7] - g; m->m_type = TYPE_IP; m->m_def = "192.36.148.17"; m++; m->m_title = "dns root name server 9"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsi"; m->m_off = (char *)&g_conf.m_rnsIps[8] - g; m->m_type = TYPE_IP; m->m_def = "192.58.128.30"; m++; m->m_title = "dns root name server 10"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsj"; m->m_off = (char *)&g_conf.m_rnsIps[9] - g; m->m_type = TYPE_IP; m->m_def = "193.0.14.129"; m++; m->m_title = "dns root name server 11"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsk"; m->m_off = (char *)&g_conf.m_rnsIps[10] - g; m->m_type = TYPE_IP; m->m_def = "198.32.64.12"; m++; m->m_title = "dns root name server 12"; m->m_desc = "IP address of a DNS root server. 
Assumes UDP " "port 53."; m->m_cgi = "rnsl"; m->m_off = (char *)&g_conf.m_rnsIps[11] - g; m->m_type = TYPE_IP; m->m_def = "202.12.27.33"; m++; m->m_title = "dns root name server 13"; m->m_desc = "IP address of a DNS root server. Assumes UDP " "port 53."; m->m_cgi = "rnsm"; m->m_off = (char *)&g_conf.m_rnsIps[12] - g; m->m_type = TYPE_IP; m->m_def = "198.41.0.4"; m++; */ m->m_title = "dns 0"; m->m_desc = "IP address of the primary DNS server. Assumes UDP " "port 53. REQUIRED FOR SPIDERING! Use Google's " "public DNS 8.8.8.8 as default."; m->m_cgi = "pdns"; m->m_off = (char *)&g_conf.m_dnsIps[0] - g; m->m_type = TYPE_IP; // default to google public dns #1 m->m_def = "8.8.8.8"; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 1"; m->m_desc = "IP address of the secondary DNS server. Assumes UDP " "port 53. Will be accessed in conjunction with the primary " "dns, so make sure this is always up. An ip of 0 means " "disabled. Google's secondary public DNS is 8.8.4.4."; m->m_cgi = "sdns"; m->m_off = (char *)&g_conf.m_dnsIps[1] - g; m->m_type = TYPE_IP; // default to google public dns #2 m->m_def = "8.8.4.4"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 2"; m->m_desc = "All hosts send to these DNSes based on hash " "of the subdomain to try to split DNS load evenly."; m->m_cgi = "sdnsa"; m->m_off = (char *)&g_conf.m_dnsIps[2] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 3"; m->m_desc = ""; m->m_cgi = "sdnsb"; m->m_off = (char *)&g_conf.m_dnsIps[3] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 4"; m->m_desc = ""; m->m_cgi = "sdnsc"; m->m_off = (char *)&g_conf.m_dnsIps[4] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 5"; m->m_desc = ""; m->m_cgi = "sdnsd"; m->m_off = (char *)&g_conf.m_dnsIps[5] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 6"; m->m_desc = ""; m->m_cgi = "sdnse"; m->m_off = (char *)&g_conf.m_dnsIps[6] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 7"; m->m_desc = ""; m->m_cgi = "sdnsf"; m->m_off = (char *)&g_conf.m_dnsIps[7] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 8"; m->m_desc = ""; m->m_cgi = "sdnsg"; m->m_off = (char *)&g_conf.m_dnsIps[8] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 9"; m->m_desc = ""; m->m_cgi = "sdnsh"; m->m_off = (char *)&g_conf.m_dnsIps[9] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 10"; m->m_desc = ""; m->m_cgi = "sdnsi"; m->m_off = (char *)&g_conf.m_dnsIps[10] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 11"; m->m_desc = ""; m->m_cgi = "sdnsj"; m->m_off = (char *)&g_conf.m_dnsIps[11] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 12"; m->m_desc = ""; m->m_cgi = "sdnsk"; m->m_off = (char *)&g_conf.m_dnsIps[12] - g; m->m_type = TYPE_IP; 
m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 13"; m->m_desc = ""; m->m_cgi = "sdnsl"; m->m_off = (char *)&g_conf.m_dnsIps[13] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 14"; m->m_desc = ""; m->m_cgi = "sdnsm"; m->m_off = (char *)&g_conf.m_dnsIps[14] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dns 15"; m->m_desc = ""; m->m_cgi = "sdnsn"; m->m_off = (char *)&g_conf.m_dnsIps[15] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "geocoder IP #1"; m->m_desc = ""; m->m_cgi = "gca"; m->m_off = (char *)&g_conf.m_geocoderIps[0] - g; m->m_type = TYPE_IP; m->m_def = "10.5.66.11"; // sp1 m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "geocoder IP #2"; m->m_desc = ""; m->m_cgi = "gcb"; m->m_off = (char *)&g_conf.m_geocoderIps[1] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "geocoder IP #3"; m->m_desc = ""; m->m_cgi = "gcc"; m->m_off = (char *)&g_conf.m_geocoderIps[2] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "geocoder IP #4"; m->m_desc = ""; m->m_cgi = "gcd"; m->m_off = (char *)&g_conf.m_geocoderIps[3] - g; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "wiki proxy ip"; m->m_desc = "Access the wiki coll through this proxy ip"; m->m_cgi = "wpi"; m->m_off = (char *)&g_conf.m_wikiProxyIp - g; m->m_type = TYPE_IP; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "wiki proxy port"; m->m_desc = "Access the wiki coll through this proxy port"; m->m_cgi = "wpp"; m->m_off = (char *)&g_conf.m_wikiProxyPort - g; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "default collection"; m->m_desc = "When no collection is explicitly specified, assume " "this collection name."; m->m_cgi = "dcn"; m->m_off = (char *)&g_conf.m_defaultColl - g; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN+1; m->m_def = ""; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "directory collection"; m->m_desc = "Collection to be used for directory searching and " "display of directory topic pages."; m->m_cgi = "dircn"; m->m_off = (char *)&g_conf.m_dirColl - g; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN+1; m->m_def = "main"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "directory hostname"; m->m_desc = "Hostname of the server providing the directory. 
" "Leave empty to use this host."; m->m_cgi = "dirhn"; m->m_off = (char *)&g_conf.m_dirHost - g; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_def = ""; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max incoming bandwidth for spider"; m->m_desc = "Total incoming bandwidth used by all spiders should " "not exceed this many kilobits per second. "; m->m_cgi = "mkbps"; m->m_off = (char *)&g_conf.m_maxIncomingKbps - g; m->m_type = TYPE_FLOAT; m->m_def = "999999.0"; m->m_units = "Kbps"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max 1-minute sliding-window loadavg"; m->m_desc = "Spiders will shed load when their host exceeds this " "value for the 1-minute load average in /proc/loadavg. " "The value 0.0 disables this feature."; m->m_cgi = "mswl"; m->m_off = (char *)&g_conf.m_maxLoadAvg - g; m->m_type = TYPE_FLOAT; m->m_def = "0.0"; m->m_units = ""; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max pages per second"; m->m_desc = "Maximum number of pages to index or delete from index " "per second for all hosts combined."; m->m_cgi = "mpps"; m->m_off = (char *)&g_conf.m_maxPagesPerSecond - g; m->m_type = TYPE_FLOAT; m->m_def = "999999.0"; m->m_units = "pages/second"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "distributed spider balance"; m->m_desc = "Max number of ready domains a host can have distributed " "to it by all other host. This should be some multiple of the " "total number of hosts in the cluster."; m->m_cgi = "dsb"; m->m_off = (char *)&g_conf.m_distributedSpiderBalance - g; m->m_type = TYPE_LONG; m->m_def = "1024"; m->m_units = "domains"; m++; m->m_title = "distributed same ip wait (hack)"; m->m_desc = "Amount of time to wait if this IP is already being " "downloaded by a host. Works only in conjunction with " "distribute spider downloads by ip in Spider Controls."; m->m_cgi = "dsiw"; m->m_off = (char *)&g_conf.m_distributedIpWait - g; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_units = "ms"; m->m_group = 0; m->m_min = 0; m++; */ /* m->m_title = "root quality max cache age base"; m->m_desc = "Maximum age to cache quality of a root url in seconds. " "Computing " "the quality of especially root urls can be expensive. " "This number is multiplied by (Q-30)/10 where Q is the cached " "quality of the root url. Therefore, higher quality and more " "stable root urls are updated less often, which is a good thing " "since they are more expensive to recompute."; m->m_cgi = "rqmca"; m->m_off = (char *)&g_conf.m_siteQualityMaxCacheAge - g; m->m_type = TYPE_LONG; m->m_def = "7257600"; // 3 months (in seconds) m->m_units = "seconds"; m++; */ m->m_title = "use threads"; m->m_desc = "If enabled, Gigablast will use threads."; m->m_cgi = "ut"; m->m_off = (char *)&g_conf.m_useThreads - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; // turn off for now. after seeing how SLOOOOOW brian's merge op was // when all 16 shards on a 16-core machine were merging (even w/ SSDs) // i turned threads off and it was over 100x faster. so until we have // pooling or something turn these off m->m_title = "use threads for disk"; m->m_desc = "If enabled, Gigablast will use threads for disk ops. 
" "Now that Gigablast uses pthreads more effectively, " "leave this enabled for optimal performance in all cases."; //"Until pthreads is any good leave this off. If you have " //"SSDs performance can be as much as 100x better."; m->m_cgi = "utfd"; m->m_off = (char *)&g_conf.m_useThreadsForDisk - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = 0;//PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "use threads for intersects and merges"; m->m_desc = "If enabled, Gigablast will use threads for these ops. " "Default is now on in the event you have simultaneous queries " "so one query does not hold back the other. There seems " "to be a bug so leave this ON for now."; //"Until pthreads is any good leave this off."; m->m_cgi = "utfio"; m->m_off = (char *)&g_conf.m_useThreadsForIndexOps - g; m->m_type = TYPE_BOOL; // enable this in the event of multiple cores available and // large simultaneous queries coming in m->m_def = "1"; m->m_flags = 0;//PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "use threads for system calls"; m->m_desc = "Gigablast does not make too many system calls so " "leave this on in case the system call is slow."; m->m_cgi = "utfsc"; m->m_off = (char *)&g_conf.m_useThreadsForSystemCalls - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = 0;//PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "max cpu threads"; m->m_desc = "Maximum number of threads to use per Gigablast process " "for intersecting docid lists."; m->m_cgi = "mct"; m->m_off = (char *)&g_conf.m_maxCpuThreads - g; m->m_type = TYPE_LONG; // make it 3 for new gb in case one query takes way longer // than the others m->m_def = "6"; // "2"; m->m_units = "threads"; m->m_min = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "max cpu merge threads"; m->m_desc = "Maximum number of threads to use per Gigablast process " "for merging lists read from disk."; m->m_cgi = "mcmt"; m->m_off = (char *)&g_conf.m_maxCpuMergeThreads - g; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_units = "threads"; m->m_min = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "max write threads"; m->m_desc = "Maximum number of threads to use per Gigablast process " "for writing data to the disk. " "Keep low to reduce file interlace effects and impact " "on query response time."; m->m_cgi = "mwt"; m->m_off = (char *)&g_conf.m_maxWriteThreads - g; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_units = "threads"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "flush disk writes"; m->m_desc = "If enabled then all writes will be flushed to disk. " "If not enabled, then gb uses the Linux disk write cache."; m->m_cgi = "fw"; m->m_off = (char *)&g_conf.m_flushWrites - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; /* m->m_title = "files group writable"; m->m_desc = "Make all created files group writable? If you have " "multiple user accounts starting Gigablast processes you " "will want the files to be group writable. 
You will " "need to make sure you run gigablast under the " "primary group you want to use for gigablast administration."; m->m_cgi = "afgw"; m->m_off = (char *)&g_conf.m_makeAllFilesGroupWritable - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; */ m->m_title = "verify written lists"; m->m_desc = "Ensure lists being written to disk are not corrupt. " "That title recs appear valid, etc. Helps isolate sources " "of corruption. Used for debugging."; m->m_cgi = "vwl"; m->m_off = (char *)&g_conf.m_verifyDumpedLists - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "verify disk writes"; m->m_desc = "Read what was written in a verification step. Decreases " "performance, but may help fight disk corruption mostly on " "Maxtors and Western Digitals."; m->m_cgi = "vdw"; m->m_off = (char *)&g_conf.m_verifyWrites - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; m->m_title = "max spider read threads"; m->m_desc = "Maximum number of threads to use per Gigablast process " "for accessing the disk " "for index-building purposes. Keep low to reduce impact " "on query response time. Increase for fast disks or when " "preferring build speed over lower query latencies"; m->m_cgi = "smdt"; m->m_off = (char *)&g_conf.m_spiderMaxDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "20"; m->m_units = "threads"; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; /* m->m_title = "max spider big read threads"; m->m_desc = "This particular number applies to all disk " "reads above 1MB. " "The number of total threads is also " "limited to MAX_STACKS which is currently 20."; m->m_cgi = "smbdt"; m->m_off = (char *)&g_conf.m_spiderMaxBigDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_units = "threads"; m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max spider medium read threads"; m->m_desc = "This particular number applies to all disk " "reads above 100K. " "The number of total threads is also " "limited to MAX_STACKS which is currently 20."; m->m_cgi = "smmdt"; m->m_off = (char *)&g_conf.m_spiderMaxMedDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "4"; m->m_units = "threads"; m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max spider small read threads"; m->m_desc = "This particular number applies to all disk " "reads above 1MB. " "The number of total threads is also " "limited to MAX_STACKS which is currently 20."; m->m_cgi = "smsdt"; m->m_off = (char *)&g_conf.m_spiderMaxSmaDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "15"; m->m_units = "threads"; m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; */ m->m_title = "separate disk reads"; m->m_desc = "If enabled then we will not launch a low priority " "disk read or write while a high priority is outstanding. 
" "Help improve query response time at the expense of " "spider performance."; m->m_cgi = "sdt"; m->m_off = (char *)&g_conf.m_separateDiskReads - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = 0; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "max query read threads"; m->m_desc = "Maximum number of threads to use per Gigablast process " "for accessing the disk " "for querying purposes."; //IDE systems tend to be more " // "responsive when this is low. Increase for SCSI or RAID " // "systems."; m->m_cgi = "qmdt"; m->m_off = (char *)&g_conf.m_queryMaxDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_units = "threads"; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m->m_group = 0; m++; */ /* m->m_title = "max query big read threads"; m->m_desc = "This particular number applies to all reads above 1MB. " "The number of total threads is also " "limited to MAX_STACKS which is currently 20."; m->m_cgi = "qmbdt"; m->m_off = (char *)&g_conf.m_queryMaxBigDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "20"; // 1 m->m_units = "threads"; m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max query medium read threads"; m->m_desc = "This particular number applies to all disk " "reads above 100K. " "The number of total threads is also " "limited to MAX_STACKS which is currently 20."; m->m_cgi = "qmmdt"; m->m_off = (char *)&g_conf.m_queryMaxMedDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "20"; // 3 m->m_units = "threads"; m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "max query small read threads"; m->m_desc = "This particular number applies to all disk " "reads above 1MB. " "The number of total threads is also " "limited to MAX_STACKS which is currently 20."; m->m_cgi = "qmsdt"; m->m_off = (char *)&g_conf.m_queryMaxSmaDiskThreads - g; m->m_type = TYPE_LONG; m->m_def = "20"; m->m_units = "threads"; m->m_group = 0; m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; */ m->m_title = "min popularity for speller"; m->m_desc = "Word or phrase must be present in this percent " "of documents in order to qualify as a spelling " "recommendation."; m->m_cgi = "mps"; m->m_off = (char *)&g_conf.m_minPopForSpeller - g; m->m_type = TYPE_FLOAT; m->m_def = ".01"; m->m_units = "%%"; m->m_priv = 2; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "phrase weight"; m->m_desc = "Percent to weight phrases in queries."; m->m_cgi = "qp"; m->m_off = (char *)&g_conf.m_queryPhraseWeight - g; m->m_type = TYPE_FLOAT; // was 350, but 'new mexico tourism' and 'boots uk' // emphasized the phrase terms too much!! 
m->m_def = "100"; m->m_units = "%%"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "weights.cpp slider parm (tmp)"; m->m_desc = "Percent of how much to use words to phrase ratio weights."; m->m_cgi = "wsp"; m->m_off = (char *)&g_conf.m_sliderParm - g; m->m_type = TYPE_LONG; m->m_def = "90"; m->m_units = "%%"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "indextable intersection algo to use"; m->m_desc = "0 means adds the term scores, 1 means average them " "and 2 means take the RMS."; m->m_cgi = "iia"; m->m_off = (char *)&g_conf.m_indexTableIntersectionAlgo - g; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_group = 0; m++; */ /* m->m_title = "max weight"; m->m_desc = "Maximum, relative query term weight. Set to 0 or less " "to indicate now max. 10.0 or 20.0 might be a good value."; m->m_cgi = "qm"; m->m_off = (char *)&g_conf.m_queryMaxMultiplier - g; m->m_type = TYPE_FLOAT; m->m_def = "0.0"; m->m_group = 0; m++; */ /* m->m_title = "query term exponent"; m->m_desc = "Raise the weights of the query " "terms to this power. The weight of a query term is " "basically the log of its term frequency. Increasing " "this will increase the effects of the term frequency " "related to each term in the query. Term frequency is " "also known as the term popularity. Very common words " "typically have lower weights tied to them, but the effects " "of such weighting will be increased if you increase this " "exponent."; m->m_cgi = "qte"; m->m_off = (char *)&g_conf.m_queryExp - g; m->m_type = TYPE_FLOAT; m->m_def = "1.1"; m->m_group = 0; m++; */ /* m->m_title = "use dynamic phrase weighting"; m->m_desc = "A new algorithm which reduces the weight on a query " "word term if the query phrase terms it is in are of " "similar popularity (term frequency) to that of the word " "term."; m->m_cgi = "udpw"; m->m_off = (char *)&g_conf.m_useDynamicPhraseWeighting - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m++; */ m->m_title = "maximum serialized query size"; m->m_desc = "When passing queries around the network, send the raw " "string instead of the serialized query if the required " "buffer is bigger than this. Smaller values decrease network " "traffic for large queries at the expense of processing time."; m->m_cgi = "msqs"; m->m_off = (char *)&g_conf.m_maxSerializedQuerySize - g; m->m_type = TYPE_LONG; m->m_def = "8192"; m->m_units = "bytes"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "merge buf size"; m->m_desc = "Read and write this many bytes at a time when merging " "files. Smaller values are kinder to query performance, " " but the merge takes longer. 
Use at least 1000000 for " "fast merging."; m->m_cgi = "mbs"; m->m_off = (char *)&g_conf.m_mergeBufSize - g; m->m_type = TYPE_LONG; // keep this way smaller than that 800k we had in here, 100k seems // to be way better performance for qps m->m_def = "500000"; m->m_units = "bytes"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "catdb minRecSizes"; m->m_desc = "minRecSizes for Catdb lookups"; m->m_cgi = "catmsr"; m->m_off = (char *)&g_conf.m_catdbMinRecSizes - g; m->m_type = TYPE_LONG; m->m_def = "100000000"; // 100 million m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "max http download sockets"; m->m_desc = "Maximum sockets available to spiders for downloading " "web pages."; m->m_cgi = "mds"; m->m_off = (char *)&g_conf.m_httpMaxDownloadSockets - g; m->m_type = TYPE_LONG; m->m_def = "5000"; m->m_group = 0; m++; */ m->m_title = "doc count adjustment"; m->m_desc = "Add this number to the total document count in the " "index. Just used for displaying on the homepage."; m->m_cgi = "dca"; m->m_off = (char *)&g_conf.m_docCountAdjustment - g; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "dynamic performance graph"; m->m_desc = "Generates profiling data for callbacks on page " "performance"; m->m_cgi = "dpg"; m->m_off = (char *)&g_conf.m_dynamicPerfGraph - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "enable profiling"; m->m_desc = "Enable profiler to do accounting of time taken by " "functions. "; m->m_cgi = "enp"; m->m_off = (char *)&g_conf.m_profilingEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "minimum profiling threshold"; m->m_desc = "Profiler will not show functions which take less " "than this many milliseconds " "in the log or on the performance graph."; m->m_cgi = "mpt"; m->m_off = (char *)&g_conf.m_minProfThreshold - g; m->m_type = TYPE_LONG; m->m_def = "10"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "sequential profiling."; m->m_desc = "Produce a LOG_TIMING log message for each " "callback called, along with the time it took. " "Profiler must be enabled."; m->m_cgi = "ensp"; m->m_off = (char *)&g_conf.m_sequentialProfiling - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; m->m_title = "use statsdb"; m->m_desc = "Archive system statistics information in Statsdb."; m->m_cgi = "usdb"; m->m_off = (char *)&g_conf.m_useStatsdb - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_MASTER; m->m_obj = OBJ_CONF; m++; /* m->m_title = "statsdb snapshots."; m->m_desc = "Archive system statistics information in Statsdb. 
" "Takes one snapshot every minute."; m->m_cgi = "sdbss"; m->m_off = (char *)&g_conf.m_statsdbSnapshots - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "statsdb web interface."; m->m_desc = "Enable the Statsdb page for viewing stats history."; m->m_cgi = "sdbwi"; m->m_off = (char *)&g_conf.m_statsdbPageEnabled - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m++; */ /* m->m_title = "max synonyms"; m->m_desc = "Maximum possible synonyms to expand a word to."; m->m_cgi = "msyn"; m->m_off = (char *)&g_conf.m_maxSynonyms - g; m->m_def = "5"; m->m_type = TYPE_LONG; m++; m->m_title = "default affinity"; m->m_desc = "spelling/number synonyms get this number as their " "affinity; negative values mean treat them as unknown, " "values higher than 1.0 get treated as 1.0"; m->m_cgi = "daff"; m->m_off = (char *)&g_conf.m_defaultAffinity - g; m->m_def = "0.9"; m->m_type = TYPE_FLOAT; m++; m->m_title = "frequency threshold"; m->m_desc = "the minimum amount a synonym term has to be in relation " "to its master term in order to be considered as a synonym"; m->m_cgi = "fqth"; m->m_off = (char *)&g_conf.m_frequencyThreshold - g; m->m_def = "0.25"; m->m_type = TYPE_FLOAT; m++; m->m_title = "maximum affinity requests"; m->m_desc = "Maximum number of outstanding requests the affinity " "builder can generate. Keep this number at 10 or lower for " "local servers, higher for internet servers or servers with " "high latency."; m->m_cgi = "mar"; m->m_off = (char *)&g_conf.m_maxAffinityRequests - g; m->m_def = "10"; m->m_type = TYPE_LONG; m->m_group = 0; m++; m->m_title = "maximum affinity errors"; m->m_desc = "Maximum number of times the affinity builder should " "encounter an error before giving up entirely."; m->m_cgi = "mae"; m->m_off = (char *)&g_conf.m_maxAffinityErrors - g; m->m_def = "100"; m->m_type = TYPE_LONG; m->m_group = 0; m++; m->m_title = "affinity timeout"; m->m_desc = "Amount of time in milliseconds to wait for a response to " "an affinity query. You shouldn't have to touch this unless " "the network is slow or overloaded."; m->m_cgi = "ato"; m->m_off = (char *)&g_conf.m_affinityTimeout - g; m->m_def = "30000"; m->m_type = TYPE_LONG; m->m_group = 0; m++; m->m_title = "affinity rebuild server"; m->m_desc = "Use this server:port to rebuild the affinity."; m->m_cgi = "ars"; m->m_off = (char *)&g_conf.m_affinityServer - g; m->m_def = "localhost:8000"; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_group = 0; m++; m->m_title = "additional affinity parameters"; m->m_desc = "Additional parameters to pass in the query. Tweak these " "to get better/faster responses. Don't touch the raw parameter " "unless you know what you are doing."; m->m_cgi = "aap"; m->m_off = (char *)&g_conf.m_affinityParms - g; m->m_def = "&raw=5&dio=1&n=1000&code=gbmonitor"; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_group = 0; m++; */ ////// // END MASTER CONTROLS ////// /////////////////////////////////////////// // ACCESS CONTROLS /////////////////////////////////////////// /* // ARRAYS // each will have its own table, title will be in first row // of that table, 2nd row is description, then one row per // element in the array, then a final row for adding new elements // if not exceeding our m->m_max limit. m->m_title = "Passwords Required to Search this Collection"; m->m_desc ="Passwords allowed to perform searches on this collection." 
" If no passwords are specified, then anyone can search it."; m->m_cgi = "searchpwd"; m->m_xml = "searchPassword"; m->m_max = MAX_SEARCH_PASSWORDS; m->m_off = (char *)cr.m_searchPwds - x; m->m_type = TYPE_STRINGNONEMPTY; m->m_size = PASSWORD_MAX_LEN+1; // string size max m->m_page = PAGE_ACCESS; m->m_def = ""; m++; m->m_title = "IPs Banned from Searching this Collection"; m->m_desc = "These IPs are not allowed to search this collection or " "use add url. Useful to keep out miscreants. Use zero for the " "last number of the IP to ban an entire IP domain."; m->m_cgi = "bip"; m->m_xml = "bannedIp"; m->m_max = MAX_BANNED_IPS; m->m_off = (char *)cr.m_banIps - x; m->m_type = TYPE_IP; m->m_def = ""; m++; m->m_title = "Only These IPs can Search this Collection"; m->m_desc = "Only these IPs are allowed to search the collection and " "use the add url facilities. If you'd like to make your " "collection publicly searchable then do not add any IPs " "here.Use zero for the " "last number of the IP to restrict to an entire " "IP domain, i.e. 1.2.3.0."; m->m_cgi = "searchip"; m->m_xml = "searchIp"; m->m_max = MAX_SEARCH_IPS; m->m_off = (char *)cr.m_searchIps - x; m->m_type = TYPE_IP; m->m_def = ""; m++; m->m_title = "Spam Assassin IPs"; m->m_desc = "Browsers coming from these IPs are deemed to be spam " "assassins and have access to a subset of the controls to " "ban and remove domains and IPs from the index."; m->m_cgi = "assip"; m->m_xml = "assassinIp"; m->m_max = MAX_SPAM_IPS; m->m_off = (char *)cr.m_spamIps - x; m->m_type = TYPE_IP; m->m_def = ""; m++; m->m_title = "Admin Passwords"; m->m_desc = "Passwords allowed to edit this collection record. " "First password can only be deleted by the master " "administrator. If no password of Admin IP is given at time " "of creation then the default password of 'footbar23' will " "be assigned."; m->m_cgi = "apwd"; m->m_xml = "adminPassword"; m->m_max = MAX_ADMIN_PASSWORDS; m->m_off = (char *)cr.m_adminPwds - x; m->m_type = TYPE_STRINGNONEMPTY; m->m_size = PASSWORD_MAX_LEN+1; m->m_def = ""; m++; m->m_title = "Admin IPs"; m->m_desc = "If someone connects from one of these IPs and provides " "a password from the table above then they will have full " "administrative privileges for this collection. If you " "specified no Admin Passwords above then they need only " "connect from an IP in this table to get the privledges. "; m->m_cgi = "adminip"; m->m_xml = "adminIp"; m->m_max = MAX_ADMIN_IPS; m->m_off = (char *)cr.m_adminIps - x; m->m_type = TYPE_IP; m->m_def = ""; m++; */ /////////////////////////////////////////// // URL FILTERS /////////////////////////////////////////// //m->m_title = "Url Filters"; // this is description just for the conf file. //m->m_cdesc = "See overview.html for a description of URL filters."; //m->m_type = TYPE_COMMENT; //m++; m->m_cgi = "ufp"; m->m_title = "url filters profile"; m->m_xml = "urlFiltersProfile"; m->m_desc = "Rather than editing the table below, you can select " "a predefined set of url instructions in this drop down menu " "that will update the table for you. Selecting custom " "allows you to make custom changes to the table. " "Selecting web configures the table for spidering " "the web in general. " "Selecting news configures the table for spidering " "new sites. " "Selecting chinese makes the spider prioritize the " "spidering of chinese pages, etc. " "Selecting shallow makes the spider go deep on " "all sites unless they are tagged shallow in the " "site list. 
" "Important: " "If you select a profile other than custom " "then your changes " "to the table will be lost."; m->m_off = (char *)&cr.m_urlFiltersProfile - x; m->m_colspan = 3; m->m_type = TYPE_SAFEBUF;//UFP;// 1 byte dropdown menu m->m_def = "web"; // UFP_WEB m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m->m_page = PAGE_FILTERS; m->m_obj = OBJ_COLL; m++; m->m_title = "expression"; m->m_desc = "Before downloading the contents of a URL, Gigablast " "first chains down this " "list of " "expressions, " "starting with expression #0. " //"This table is also consulted " //"for every outlink added to spiderdb. " "The first expression it matches is the ONE AND ONLY " "matching row for that url. " "It then uses the " //"" "respider frequency, " //"" "spider priority, etc. on the MATCHING ROW when spidering " //"and ruleset to " "that URL. " "If you specify the expression as " "default then that MATCHES ALL URLs. " "URLs with high spider priorities take spidering " "precedence over " "URLs with lower spider priorities. " "The respider frequency dictates how often a URL will " "be respidered. " "See the help table below for examples of all the supported " "expressions. " "Use the && operator to string multiple expressions " "together in the same expression text box. " "If you check the delete checkbox then urls matching " "that row will be deleted if already indexed, otherwise, " "they just won't be indexed." //"A spider priority of " //"FILTERED or BANNED " // "DELETE " // "will cause the URL to not be spidered, " // "or if it has already " // "been indexed, it will be deleted when it is respidered." "

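// Illustrative sketch (disabled; not part of the parm table): how the
// first-match semantics described in the expression desc above could be
// evaluated. expressionMatches() is a hypothetical stand-in for the real
// expression matcher (the real matching logic lives in the url filters
// code, not here), and m_numRegExs is assumed to hold the row count.
#if 0
bool expressionMatches ( char *expr , char *url ); // hypothetical helper
// chain down the rows starting with expression #0; the first matching
// row is the ONE AND ONLY matching row. "default" matches all urls.
int32_t getFirstMatchingFilterRow ( CollectionRec *cr , char *url ) {
	for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) {
		char *expr = cr->m_regExs[i].getBufStart();
		if ( strcmp ( expr , "default" ) == 0 ) return i;
		if ( expressionMatches ( expr , url ) ) return i;
	}
	return -1; // no row matched this url
}
#endif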
"; /* "A URL is respidered according to the " "spider frequency. If this is blank then Gigablast will " "use the spider frequency explicitly dictated by the rule " "set. If the ruleset does not contain a " "xml tag, then Gigablast will " "intelligently determine the best time to respider that " "URL.

" "If the " "" "spider priority of a URL is undefined then " "Gigablast will use the spider priority explicitly " "dictated by the ruleset. If the ruleset does not contain " "a xml tag, then Gigablast " "will spider that URL with a priority of its linking parent " "minus 1, " "resulting in breadth first spidering. A URL of spider " "priority X will be placed in spider priority queue #X. " "Many spider parameters can be configured on a per " "spider priority queue basis. For instance, spidering " "can be toggled on a per queue basis, as can link " "harvesting.

" "The ruleset you select corresponds to a file on " "disk named tagdb*.xml, where the '*' is a number. Each of " "these files is a set of rules in XML that dictate how to " "index and spider a document. " "You can add your own ruleset file to Gigablast's working " "directory and it will automatically be " "included in the ruleset drop down menu. Once a document " "has been indexed with a ruleset, then the corresponding " "ruleset file cannot be deleted without risk of corruption." "

" "You can have up to 32 regular expressions. " "Example: ^http://.*\\.uk/ would match all urls from " "the UK. See this " "" "tutorial by example for more information." "

" "Gigablast also supports the following special \"regular " "expressions\": " "link:gigablast and doc:qualityX."; */ m->m_cgi = "fe"; m->m_xml = "filterExpression"; m->m_max = MAX_FILTERS; // array of safebufs i guess... m->m_off = (char *)cr.m_regExs - x; // this is a safebuf, dynamically allocated string really m->m_type = TYPE_SAFEBUF;//STRINGNONEMPTY // the size of each element in the array: m->m_size = sizeof(SafeBuf);//MAX_REGEX_LEN+1; m->m_page = PAGE_FILTERS; m->m_rowid = 1; // if we START a new row m->m_def = ""; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m->m_page = PAGE_FILTERS; m->m_obj = OBJ_COLL; m++; m->m_title = "harvest links"; m->m_cgi = "hspl"; m->m_xml = "harvestLinks"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_harvestLinks - x; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_page = PAGE_FILTERS; m->m_rowid = 1; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m->m_obj = OBJ_COLL; m++; /* m->m_title = "spidering enabled"; m->m_cgi = "cspe"; m->m_xml = "spidersEnabled"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_spidersEnabled - x; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_page = PAGE_FILTERS; m->m_rowid = 1; m->m_flags = PF_REBUILDURLFILTERS; m++; */ m->m_title = "respider frequency (days)"; m->m_cgi = "fsf"; m->m_xml = "filterFrequency"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_spiderFreqs - x; m->m_type = TYPE_FLOAT; // why was this default 0 days? m->m_def = "30.0"; // 0.0 m->m_page = PAGE_FILTERS; m->m_obj = OBJ_COLL; m->m_units = "days"; m->m_rowid = 1; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m++; m->m_title = "max spiders"; m->m_desc = "Do not allow more than this many outstanding spiders " "for all urls in this priority."; // was "per rule" m->m_cgi = "mspr"; m->m_xml = "maxSpidersPerRule"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_maxSpidersPerRule - x; m->m_type = TYPE_LONG; m->m_def = "99"; m->m_page = PAGE_FILTERS; m->m_obj = OBJ_COLL; m->m_rowid = 1; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m++; m->m_title = "max spiders per ip"; m->m_desc = "Allow this many spiders per IP."; m->m_cgi = "mspi"; m->m_xml = "maxSpidersPerIp"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_spiderIpMaxSpiders - x; m->m_type = TYPE_LONG; m->m_def = "7"; m->m_page = PAGE_FILTERS; m->m_obj = OBJ_COLL; m->m_rowid = 1; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m++; m->m_title = "same ip wait (ms)"; m->m_desc = "Wait at least this int32_t before downloading urls from " "the same IP address."; m->m_cgi = "xg"; m->m_xml = "spiderIpWait"; m->m_max = MAX_FILTERS; //m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_spiderIpWaits - x; m->m_type = TYPE_LONG; m->m_def = "1000"; m->m_page = PAGE_FILTERS; m->m_obj = OBJ_COLL; m->m_units = "milliseconds"; m->m_rowid = 1; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m++; /* m->m_title = "page quota"; m->m_cgi = "fsq"; m->m_xml = "filterQuota"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_spiderQuotas - x; m->m_type = TYPE_LONG_LONG; m->m_def = "-1"; // -1 means no quota m->m_page = PAGE_FILTERS; m->m_units = "pages"; m->m_rowid = 1; m++; */ m->m_title = "delete"; m->m_cgi = "fdu"; m->m_xml = "forceDeleteUrls"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_forceDelete - x; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_page = PAGE_FILTERS; m->m_rowid = 1; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m->m_obj = OBJ_COLL; m++; m->m_title = "spider priority"; m->m_cgi = "fsp"; m->m_xml = "filterPriority"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_spiderPriorities - x; m->m_type = 
/* m->m_title = "diffbot api"; m->m_cgi = "dapi"; m->m_xml = "diffbotAPI"; m->m_max = MAX_FILTERS; m->m_off = (char *)cr.m_spiderDiffbotApiUrl - x; // HACK: we print a dropdown for this but the value is a string // because the items in the drop down can change so we can't store // an item # here, it has to be a string, i.e. the diffbot api url. // john might add a new custom api to m_diffbotApiList at any time. // so we select the item in the drop down if it matches THIS string. m->m_type = TYPE_SAFEBUF;//DIFFBOT_DROPDOWN; m->m_def = ""; m->m_page = PAGE_FILTERS; m->m_size = sizeof(SafeBuf); m->m_rowid = 1; m->m_addin = 1; // "insert" follows? m->m_flags = PF_REBUILDURLFILTERS | PF_DIFFBOT; m++; */ //m->m_title = "ruleset"; //m->m_cgi = "frs"; //m->m_xml = "filterRuleset"; //m->m_max = MAX_FILTERS; //m->m_off = (char *)cr.m_rulesets - x; //m->m_type = TYPE_RULESET; // int32_t with dropdown of rulesets //m->m_page = PAGE_FILTERS; //m->m_rowid = 1; //m->m_addin = 1; // "insert" follows? //m->m_def = ""; //m++; /* // default rule m->m_title = "DEFAULT"; m->m_desc = "Use the following values by default if no ruleset in " "tagdb matches the URL."; m->m_type = TYPE_CONSTANT; m->m_page = PAGE_FILTERS; m->m_rowid = 2; m->m_hdrs = 0; m++; //m->m_cdesc = "The default parameters if no reg exs above matched."; m->m_cgi = "fsfd"; m->m_xml = "filterFrequencyDefault"; m->m_off = (char *)&cr.m_defaultSpiderFrequency - x; m->m_type = TYPE_FLOAT; m->m_def = "0.0"; m->m_page = PAGE_FILTERS; m->m_units = "days"; m->m_rowid = 2; m->m_hdrs = 0; m++; m->m_cgi = "fsqd"; m->m_xml = "filterQuotaDefault"; m->m_off = (char *)&cr.m_defaultSpiderQuota - x; m->m_type = TYPE_LONG_LONG; m->m_def = "-1"; m->m_page = PAGE_FILTERS; m->m_units = "pages"; m->m_rowid = 2; m->m_hdrs = 0; m++; m->m_cgi = "fspd"; m->m_xml = "filterPriorityDefault"; m->m_off = (char *)&cr.m_defaultSpiderPriority - x; m->m_type = TYPE_PRIORITY2; // includes UNDEFINED priority in dropdown m->m_def = "4"; m->m_page = PAGE_FILTERS; m->m_rowid = 2; m->m_hdrs = 0; m++; */ /* m->m_cgi = "frsd"; m->m_xml = "filterRulesetDefault"; m->m_off = (char *)&cr.m_defaultSiteFileNum - x; m->m_type = TYPE_RULESET; // int32_t with dropdown of rulesets m->m_def = "0"; m->m_page = PAGE_FILTERS; m->m_rowid = 2; m->m_hdrs = 0; m++; */ /* /////////////////////////////////////////// // PRIORITY CONTROLS /////////////////////////////////////////// // . show the priority in this column // . a monotonic sequence repeating each number twice, // basically, div 2 is what "D2" means // . so we get 0,0,1,1,2,2,3,3, ... m->m_title = "priority"; //m->m_desc = "What priority is this spider queue?"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_type = TYPE_MONOD2; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; // . show an alternating 0 and 1 in this column // because it is type MONOM2, a monotonic sequence // modulus 2. // . so we get 0,1,0,1,0,1,0,1,0,1, ...
m->m_title = "is new"; m->m_desc = "Does this priority contain new (unindexed) urls?"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_type = TYPE_MONOM2; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; m->m_title = "spidering enabled"; m->m_desc = "Are spiders enabled for this priority?"; m->m_cgi = "xa"; m->m_xml = "spiderPrioritySpideringEnabled"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_spideringEnabled - x; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; m->m_title = "time slice weight"; m->m_desc = "What percentage of the time to draw urls from " "this priority?"; m->m_cgi = "xb"; m->m_xml = "spiderPriotiyTimeSlice"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_timeSlice - x; m->m_type = TYPE_FLOAT; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; // if we START a new row m->m_def = "100.0"; m->m_units = "%%"; m++; m->m_title = "spidered"; m->m_desc = "How many urls we spidered so far last 5 minutes."; m->m_cgi = "sps"; m->m_xml = "spiderPriotiySpidered"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_spidered - x; m->m_type = TYPE_LONG_CONST; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; // if we START a new row m->m_def = "0"; m->m_sync = false; // do not sync this parm m++; m->m_title = "spider links"; m->m_desc = "Harvest links from the content and add to spiderdb."; m->m_cgi = "xc"; m->m_xml = "spiderPrioritySpiderLinks"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_spiderLinks - x; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; m->m_title = "spider same host outlinks only"; m->m_desc = "Harvest links to the same hostnames (www.xyz.com) " "and add to spiderdb."; m->m_cgi = "xd"; m->m_xml = "spiderPrioritySpiderSameHostnameLinks"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_spiderSameHostnameLinks - x; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; m->m_title = "force links into queue"; m->m_desc = "If slated to be added to this queue, and link is " "already in a non-forced queue, force it into this queue. 
" "Keep a cache to reduce reptitious adds to this queue."; m->m_cgi = "xdd"; m->m_xml = "spiderPriorityForceQueue"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_autoForceQueue - x; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; m->m_title = "max spiders per ip"; m->m_desc = "Do not allow more than this many simultaneous " "downloads per IP address."; m->m_cgi = "xe"; m->m_xml = "spiderPriorityMaxSpidersPerIp"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_maxSpidersPerIp - x; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; m->m_title = "max spiders per domain"; m->m_desc = "Do not allow more than this many simultaneous " "downloads per domain."; m->m_cgi = "xf"; m->m_xml = "spiderPriorityMaxSpidersPerDom"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_maxSpidersPerDom - x; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m++; m->m_title = "max respider wait (days)"; m->m_desc = "Do not wait longer than this before attempting to " "respider."; m->m_cgi = "xr"; m->m_xml = "spiderPriorityMaxRespiderWait"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_maxRespiderWait - x; m->m_type = TYPE_FLOAT; m->m_def = "180.0"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m->m_units = "days"; m++; m->m_title = "first respider wait (days)"; m->m_desc = "Reschedule a new url for respidering this many days " "from the first time it is actually spidered."; m->m_cgi = "xfrw"; m->m_xml = "spiderPriorityFirstRespiderWait"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_firstRespiderWait - x; m->m_type = TYPE_FLOAT; m->m_def = "60.0"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m->m_units = "days"; m++; m->m_title = "same ip wait (ms)"; m->m_desc = "Wait at least this int32_t before downloading urls from " "the same IP address."; m->m_cgi = "xg"; m->m_xml = "spiderPrioritySameIpWait"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_sameIpWait - x; m->m_type = TYPE_LONG; m->m_def = "10000"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m->m_units = "milliseconds"; m++; m->m_title = "same domain wait (ms)"; m->m_desc = "Wait at least this int32_t before downloading urls from " "the same domain."; m->m_cgi = "xh"; m->m_xml = "spiderPrioritySameDomainWait"; m->m_max = MAX_PRIORITY_QUEUES; m->m_fixed = MAX_PRIORITY_QUEUES; m->m_off = (char *)cr.m_pq_sameDomainWait - x; m->m_type = TYPE_LONG; m->m_def = "10000"; m->m_page = PAGE_PRIORITIES; m->m_rowid = 3; m->m_units = "milliseconds"; m++; */ /////////////////////////////////////////// // SITEDB FILTERS /////////////////////////////////////////// /* m->m_title = "site expression"; m->m_desc = "The site of a url is a substring of that url, which " "defined a set of urls which are all primarily controlled " "by the same entity. The smallest such site of a url is " "returned, because a url can have multiple sites. 
Like " "fred.blogspot.com is a site and the blogspot.com site " "contains that site."; m->m_cgi = "sdbfe"; m->m_xml = "siteExpression"; m->m_max = MAX_SITE_EXPRESSIONS; m->m_off = (char *)cr.m_siteExpressions - x; m->m_type = TYPE_STRINGNONEMPTY; m->m_size = MAX_SITE_EXPRESSION_LEN+1; m->m_page = PAGE_RULES; m->m_rowid = 1; // if we START a new row m->m_def = ""; m++; m->m_title = "site rule"; m->m_cgi = "sdbsrs"; m->m_xml = "siteRule"; m->m_max = MAX_SITE_EXPRESSIONS; m->m_off = (char *)cr.m_siteRules - x; m->m_type = TYPE_SITERULE; m->m_page = PAGE_RULES; m->m_rowid = 1; m->m_def = "0"; m++; */ /* m->m_title = "siterec default ruleset"; m->m_cgi = "sdbfdr"; m->m_xml = "siterecDefaultRuleset"; m->m_max = MAX_SITEDB_FILTERS; m->m_off = (char *)cr.m_sitedbFilterRulesets - x; m->m_type = TYPE_RULESET; m->m_page = PAGE_FILTERS2; m->m_rowid = 1; m->m_def = "-1"; m++; m->m_title = "ban subdomains"; m->m_cgi = "sdbbsd"; m->m_xml = "siterecBanSubdomains"; m->m_max = MAX_SITEDB_FILTERS; m->m_off = (char *)cr.m_sitedbFilterBanSubdomains - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_FILTERS2; m->m_rowid = 1; m->m_addin = 1; // "insert" follows m->m_def = "0"; m++; */ // /////////////////////////////////////////// // // SPAM CONTROLS // // /////////////////////////////////////////// // m->m_title = "char in url"; // m->m_desc = "url has - or _ or a digit in the domain, " // "has a plus in the cgi part."; // m->m_cgi = "spamctrla"; // m->m_off = (char *)&cr.m_spamTests[CHAR_IN_URL] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "20"; // //m->m_smaxc = (char *)&cr.m_spamMaxes[CHAR_IN_URL] - x; // m->m_group = 1; // m->m_sparm = 0; // m++; // m->m_title = "bad tld"; // m->m_desc = "tld is info or biz"; // m->m_cgi = "spamctrlb"; // m->m_off = (char *)&cr.m_spamTests[BAD_TLD] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "20"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "good tld"; // m->m_desc = "tld is gov, edu or mil"; // m->m_cgi = "spamctrlc"; // m->m_off = (char *)&cr.m_spamTests[GOOD_TLD] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "-20"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "title has spammy words"; // m->m_desc = "Title has spammy words, is all lower case, " // "or has > 200 chars. "; // m->m_cgi = "spamctrld"; // m->m_off = (char *)&cr.m_spamTests[WORD_IN_TITLE] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "20"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "img src to other domains"; // m->m_desc = "Page has img src to other domains. "; // m->m_cgi = "spamctrle"; // m->m_off = (char *)&cr.m_spamTests[IMG_SRC_OTHER_DOMAIN] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "5"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "page has spammy words"; // m->m_desc = "Page has spammy words. "; // m->m_cgi = "spamctrlf"; // m->m_off = (char *)&cr.m_spamTests[SPAMMY_WORDS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "5"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "consecutive link text"; // m->m_desc = "Three consecutive link texts " // "contain the same word. 
"; // m->m_cgi = "spamctrlg"; // m->m_off = (char *)&cr.m_spamTests[CONSECUTIVE_LINK_TEXT] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "10"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "affiliate company links"; // m->m_desc = "links to amazon, allposters, or zappos. "; // m->m_cgi = "spamctrlh"; // m->m_off = (char *)&cr.m_spamTests[AFFILIATE_LINKS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "10"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "affiliate in links"; // m->m_desc = "Has string 'affiliate' in the links. "; // m->m_cgi = "spamctrli"; // m->m_off = (char *)&cr.m_spamTests[AFFILIATE_LINKS2] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "40"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "Iframe to amazon"; // m->m_desc = "Has an iframe whose src is amazon. "; // m->m_cgi = "spamctrlj"; // m->m_off = (char *)&cr.m_spamTests[IFRAME_TO_AMAZON] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "30"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "int32_t links"; // m->m_desc = "Links to urls which are > 128 chars. "; // m->m_cgi = "spamctrlk"; // m->m_off = (char *)&cr.m_spamTests[LINKS_OVER_128_CHARS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "5"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "links to queries"; // m->m_desc = "links have ?q= or &q= in them. "; // m->m_cgi = "spamctrll"; // m->m_off = (char *)&cr.m_spamTests[LINKS_HAVE_QUERIES] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "5"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "google ad client"; // m->m_desc = "Page has a google ad client. "; // m->m_cgi = "spamctrlm"; // m->m_off = (char *)&cr.m_spamTests[GOOGLE_AD_CLIENT] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "20"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "percent text in links"; // m->m_desc = "percent of text in links (over 50 percent). "; // m->m_cgi = "spamctrln"; // m->m_off = (char *)&cr.m_spamTests[PERCENT_IN_LINKS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "15"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "links to a url with a - or _ in the domain"; // m->m_desc = "Links to a url with a - or _ in the domain"; // m->m_cgi = "spamctrlo"; // m->m_off = (char *)&cr.m_spamTests[DASH_IN_LINK] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "2"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "links to a url which is .info or .biz"; // m->m_desc = "Links to a url which is .info or .biz."; // m->m_cgi = "spamctrlp"; // m->m_off = (char *)&cr.m_spamTests[LINK_TO_BADTLD] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "2"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "links to a dmoz category"; // m->m_desc = "Links to a dmoz category."; // m->m_cgi = "spamctrlq"; // m->m_off = (char *)&cr.m_spamTests[LINKS_ARE_DMOZ_CATS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "4"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "consecutive bold text"; // m->m_desc = "Three consecutive bold texts " // "contain the same word. 
"; // m->m_cgi = "spamctrlr"; // m->m_off = (char *)&cr.m_spamTests[CONSECUTIVE_BOLD_TEXT] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "10"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "link text doesn't match domain"; // m->m_desc = "Link text looks like a domain, but the link doesn't go there"; // m->m_cgi = "spamctrls"; // m->m_off = (char *)&cr.m_spamTests[LINK_TEXT_NEQ_DOMAIN] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "10"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "force multiplier"; // m->m_desc = "Multiply this by the number of spam categories " // "that have points times the total points, for the final" // " score. Range between 0 and 1."; // m->m_cgi = "frcmult"; // m->m_off = (char *)&cr.m_forceMultiplier - x; // m->m_type = TYPE_FLOAT; // m->m_page = PAGE_SPAM; // m->m_def = "0.01"; // m->m_group = 1; // m->m_sparm = 0; // m++; // /////////////////////// MAXES FOR SPAM CONTROLS /////////////////////// // m->m_title = "max points for char in url"; // m->m_desc = "Max points for url has - or _ or a digit in the domain"; // m->m_cgi = "spammaxa"; // m->m_off = (char *)&cr.m_spamMaxes[CHAR_IN_URL] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 1; // m->m_sparm = 0; // m++; // m->m_title = "max points for bad tld"; // m->m_desc = "Max points for tld is info or biz"; // m->m_cgi = "spammaxb"; // m->m_off = (char *)&cr.m_spamMaxes[BAD_TLD] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_group = 0; // m->m_def = "300"; // m->m_sparm = 0; // m++; // m->m_title = "max points for good tld"; // m->m_desc = "Max points for tld is gov, edu or mil"; // m->m_cgi = "spammaxc"; // m->m_off = (char *)&cr.m_spamMaxes[GOOD_TLD] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for title has spammy words"; // m->m_desc = "Max points for Title has spammy words. "; // m->m_cgi = "spammaxd"; // m->m_off = (char *)&cr.m_spamMaxes[WORD_IN_TITLE] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for img src to other domains"; // m->m_desc = "Max points for Page has img src to other domains. "; // m->m_cgi = "spammaxe"; // m->m_off = (char *)&cr.m_spamMaxes[IMG_SRC_OTHER_DOMAIN] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for page has spammy words"; // m->m_desc = "Max points for Page has spammy words. "; // m->m_cgi = "spammaxf"; // m->m_off = (char *)&cr.m_spamMaxes[SPAMMY_WORDS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for consecutive link text"; // m->m_desc = "Max points for three consecutive link texts" // "contain the same word. "; // m->m_cgi = "spammaxg"; // m->m_off = (char *)&cr.m_spamMaxes[CONSECUTIVE_LINK_TEXT] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for affiliate company links"; // m->m_desc = "Max points for links to amazon, allposters, or zappos. 
"; // m->m_cgi = "spammaxh"; // m->m_off = (char *)&cr.m_spamMaxes[AFFILIATE_LINKS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for affiliate in links"; // m->m_desc = "Max points for Has string 'affiliate' in the links. "; // m->m_cgi = "spammaxi"; // m->m_off = (char *)&cr.m_spamMaxes[AFFILIATE_LINKS2] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for Iframe to amazon"; // m->m_desc = "Max points for Has an iframe whose src is amazon. "; // m->m_cgi = "spammaxj"; // m->m_off = (char *)&cr.m_spamMaxes[IFRAME_TO_AMAZON] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for int32_t links"; // m->m_desc = "Max points for Links to urls which are > 128 chars. "; // m->m_cgi = "spammaxk"; // m->m_off = (char *)&cr.m_spamMaxes[LINKS_OVER_128_CHARS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for links to queries"; // m->m_desc = "Max points for links have ?q= or &q= in them. "; // m->m_cgi = "spammaxl"; // m->m_off = (char *)&cr.m_spamMaxes[LINKS_HAVE_QUERIES] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m->m_sparm = 0; // m++; // m->m_title = "max points for google ad client"; // m->m_desc = "Max points for Page has a google ad client. "; // m->m_cgi = "spammaxm"; // m->m_off = (char *)&cr.m_spamMaxes[GOOGLE_AD_CLIENT] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m++; // m->m_title = "max points for percent text in links"; // m->m_desc = "Max points for percent of text in links (over 50 percent). "; // m->m_cgi = "spammaxn"; // m->m_off = (char *)&cr.m_spamMaxes[PERCENT_IN_LINKS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m++; // m->m_title = "max points for links have - or _"; // m->m_desc = "Max points for links have - or _"; // m->m_cgi = "spammaxo"; // m->m_off = (char *)&cr.m_spamMaxes[DASH_IN_LINK] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m++; // m->m_title = "max points for links to .info or .biz"; // m->m_desc = "Max points for links to .info or .biz "; // m->m_cgi = "spammaxp"; // m->m_off = (char *)&cr.m_spamMaxes[LINK_TO_BADTLD] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m++; // m->m_title = "max points for links to a dmoz category"; // m->m_desc = "Max points for links to a dmoz category."; // m->m_cgi = "spammaxq"; // m->m_off = (char *)&cr.m_spamMaxes[LINKS_ARE_DMOZ_CATS] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m++; // m->m_title = "max points for consecutive bold text"; // m->m_desc = "Max points for three consecutive bold texts" // "contain the same word. 
"; // m->m_cgi = "spammaxr"; // m->m_off = (char *)&cr.m_spamMaxes[CONSECUTIVE_BOLD_TEXT] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m++; // m->m_title = "max points for link text doesn't match domain"; // m->m_desc = "Max points for link text doesn't match domain"; // m->m_cgi = "spammaxs"; // m->m_off = (char *)&cr.m_spamMaxes[LINK_TEXT_NEQ_DOMAIN] - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_SPAM; // m->m_def = "300"; // m->m_group = 0; // m++; // /////////////////////////////////////////// // // END SPAM CONTROLS // // /////////////////////////////////////////// /////////////////////////////////////////// // QUALITY AGENT CONTROLS /////////////////////////////////////////// /* m->m_title = "all agents on"; m->m_desc = "Enable quality agent on all hosts for this collection"; m->m_cgi = "aqae"; m->m_obj = OBJ_COLL; m->m_def = "1"; m->m_off = (char *)&cr.m_qualityAgentEnabled - x; m->m_type = TYPE_BOOL2; // no yes or no, just a link m->m_page = PAGE_QAGENT; m++; m->m_title = "all agents off"; m->m_desc = "Disable quality agent on all hosts for this collection"; m->m_cgi = "aqad"; m->m_def = "0"; m->m_off = (char *)&cr.m_qualityAgentEnabled - x; m->m_type = TYPE_BOOL2; // no yes or no, just a link m++; m->m_title = "quality agent enabled"; m->m_desc = "If enabled, the agent will find quality modifiers for " "all of the sites found in titledb."; m->m_cgi = "qae"; m->m_off = (char *)&cr.m_qualityAgentEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_cast = 0; m->m_page = PAGE_QAGENT; m++; m->m_title = "quality agent continuous loop"; m->m_desc = "If enabled, the agent will loop when it reaches " "the end of titledb. Otherwise, it will disable itself."; m->m_cgi = "qale"; m->m_off = (char *)&cr.m_qualityAgentLoop - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_QAGENT; m->m_cast = 1; m++; m->m_title = "ban subsites"; m->m_desc = "If enabled, the agent will look at the paths of" " its titlerec sample, if the offending spam scores" " all come from the same subsite, we just ban that one." 
" Good for banning hijacked forums or spammed archives."; m->m_cgi = "qabs"; m->m_off = (char *)&cr.m_qualityAgentBanSubSites - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_QAGENT; m->m_cast = 1; m++; m->m_title = "start document"; m->m_desc = "The agent will start at this docid when scanning " "titledb looking for sites."; m->m_cgi = "qasd"; m->m_off = (char *)&cr.m_qualityAgentStartDoc - x; m->m_type = TYPE_LONG_LONG; m->m_def = "0"; m->m_cast = 1; m->m_page = PAGE_QAGENT; m->m_sync = false; // do not sync this parm m++; m->m_title = "site quality refresh rate"; m->m_desc = "The quality agent will try to reexamine entries in " "tagdb which were added more than this many seconds ago"; m->m_cgi = "qasqrr"; m->m_off = (char *)&cr.m_tagdbRefreshRate - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_group = 1; m->m_cast = 1; m->m_def = "2592000"; m++; m->m_title = "link samples to get"; m->m_desc = "Lookup the qualities of this many links in tagdb."; m->m_cgi = "lstg"; m->m_off = (char *)&cr.m_linkSamplesToGet - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "256"; m++; m->m_title = "min pages to evaluate"; m->m_desc = "The quality agent will skip this site if there are" " less than this many pages to evaluate."; m->m_cgi = "mpte"; m->m_off = (char *)&cr.m_minPagesToEvaluate - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "1"; m++; m->m_title = "link bonus divisor"; m->m_desc = "Decrease a page's spam score if it has a high " "link quality. The bonus is computed by dividing the " "page's link quality by this parm. LinkInfos older " "than 30 days are considered stale and are not used."; m->m_cgi = "lbd"; m->m_off = (char *)&cr.m_linkBonusDivisor - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "20"; m++; m->m_title = "points per banned link"; m->m_desc = "Subtract x points per banned site that a site links to."; m->m_cgi = "nppbl"; m->m_off = (char *)&cr.m_negPointsPerBannedLink - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "3"; m++; m->m_title = "points per link to different sites on the same IP"; m->m_desc = "Subtract x points per site linked to that is on the " "same IP as other links. Good for catching domain parking " "lots and spammers in general, but looking up the IPs " "slows down the agent considerably. 
(set to 0 to disable.)"; m->m_cgi = "pfltdssi"; m->m_off = (char *)&cr.m_penaltyForLinksToDifferentSiteSameIp - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "0"; m++; m->m_title = "number of sites on an ip to sample"; m->m_desc = "Examine this many sites on the same ip as this site"; m->m_cgi = "nsoits"; m->m_off = (char *)&cr.m_numSitesOnIpToSample - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "100"; m++; m->m_title = "points per banned site on ip"; m->m_desc = "Subtract x points from a site quality for each banned " "site on the ip"; m->m_cgi = "nppbsoi"; m->m_off = (char *)&cr.m_negPointsPerBannedSiteOnIp - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "2"; m++; m->m_title = "max penalty from being on a bad IP"; m->m_desc = "The penalty for being on a bad IP will not" " exceed this value."; m->m_cgi = "qampfboabi"; m->m_off = (char *)&cr.m_maxPenaltyFromIp - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "-30"; m++; m->m_title = "max sites per second"; m->m_desc = "The agent will not process more than this many" " sites per second. Can be less than 1."; m->m_cgi = "msps"; m->m_off = (char *)&cr.m_maxSitesPerSecond - x; m->m_type = TYPE_FLOAT; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "99999.0"; m++; m->m_title = "site agent banned ruleset"; m->m_desc = "Site agent will assign this ruleset to documents " " which are determined to be low quality."; m->m_cgi = ""; m->m_off = (char *)&cr.m_qualityAgentBanRuleset - x; m->m_type = TYPE_RULESET; // int32_t with dropdown of rulesets m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "30"; m++; m->m_title = "ban quality threshold"; m->m_desc = "If the site has a spam score greater than this parm, it will" " be inserted into the above ruleset."; m->m_cgi = "tttsb"; m->m_off = (char *)&cr.m_siteQualityBanThreshold - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "-100"; m++; m->m_title = "threshold to trigger site reindex"; m->m_desc = "If the site has a quality less than this parm, it will" " be added to the spider queue for reindexing"; m->m_cgi = "tttsr"; m->m_off = (char *)&cr.m_siteQualityReindexThreshold - x; m->m_type = TYPE_LONG; m->m_page = PAGE_QAGENT; m->m_cast = 1; m->m_def = "-100"; m++; // m->m_title = ""; // m->m_desc = ""; // m->m_cgi = ""; // m->m_off = (char *)&cr.m_ - x; // m->m_type = TYPE_LONG; // m->m_page = PAGE_QAGENT; // m->m_def = ""; // m++; */ /////////////////////////////////////////// // END QUALITY AGENT CONTROLS /////////////////////////////////////////// /////////////////////////////////////////// // AD FEED CONTROLS /////////////////////////////////////////// /* m->m_title = "num ads in paid inclusion ad feed"; m->m_desc = "The number of ads we would like returned from the ad" " server. This applies to all paid inclusion ads below."; m->m_cgi = "apin"; m->m_off = (char *)&cr.m_adPINumAds - x; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_page = PAGE_ADFEED; m++; m->m_title = "num ads in skyscraper ad feed"; m->m_desc = "The number of ads we would like returned from the ad" " server. 
This applies to all skyscraper ads below."; m->m_cgi = "assn"; m->m_off = (char *)&cr.m_adSSNumAds - x; m->m_type = TYPE_LONG; m->m_def = "5"; m->m_page = PAGE_ADFEED; m++; m->m_title = "skyscraper ad width"; m->m_desc = "The width of the skyscraper ad column in pixels"; m->m_cgi = "awd"; m->m_off = (char *)&cr.m_adWidth - x; m->m_type = TYPE_LONG; m->m_def = "300"; m->m_page = PAGE_ADFEED; m++; m->m_title = "ad feed timeout"; m->m_desc = "The time (in milliseconds) to wait for an ad list to be " "returned before timing out and displaying the results " "without any ads. This applies to all ads below."; m->m_cgi = "afto"; m->m_off = (char *)&cr.m_adFeedTimeOut - x; m->m_type = TYPE_LONG; m->m_def = "1000"; m->m_page = PAGE_ADFEED; m->m_group = 0; m++; m->m_title = "(1) paid inclusion ad enable"; m->m_desc = "Enable/Disable the paid inclusion ad."; m->m_cgi = "apie"; m->m_off = (char *)&cr.m_adPIEnable - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_ADFEED; m->m_def = "1"; m++; m->m_title = "(1) paid inclusion ad feed link"; m->m_desc = "Full link with address and parameters to retrieve an ad " "feed. To specify parameter input: %q for query, %n " "for num results, %p for page number, %i for query ip, " "and %% for %."; m->m_cgi = "apicgi"; m->m_off = (char *)cr.m_adCGI[0] - x; m->m_type = TYPE_STRING; m->m_size = MAX_CGI_URL; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion ad feed xml result tag"; m->m_desc = "Specify the full xml path for a result."; m->m_cgi = "apirx"; m->m_off = (char *)cr.m_adResultXml[0] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion ad feed xml title tag"; m->m_desc = "Specify the full xml path for the results title."; m->m_cgi = "apitx"; m->m_off = (char *)cr.m_adTitleXml[0] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion ad feed xml description tag"; m->m_desc = "Specify the full xml path for the results description."; m->m_cgi = "apidx"; m->m_off = (char *)cr.m_adDescXml[0] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion ad feed xml link tag"; m->m_desc = "Specify the full xml path for the results link. This " "is the link that is shown as plain text, not an actual " "link, below the ad description."; m->m_cgi = "apilx"; m->m_off = (char *)cr.m_adLinkXml[0] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion ad feed xml url tag"; m->m_desc = "Specify the full xml path for the results url. This is " "the link associated with the title."; m->m_cgi = "apiux"; m->m_off = (char *)cr.m_adUrlXml[0] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion backup ad feed link"; m->m_desc = "Full link with address and parameters to retrieve an ad " "feed. 
To specify parameter input: %q for query, %n " "for num results, %p for page number, %i for query ip, " "and %% for %."; m->m_cgi = "apicgib"; m->m_off = (char *)cr.m_adCGI[1] - x; m->m_type = TYPE_STRING; m->m_size = MAX_CGI_URL; m->m_page = PAGE_ADFEED; m->m_def = ""; m++; m->m_title = "(1) paid inclusion backup ad feed xml result tag"; m->m_desc = "Specify the full xml path for a result."; m->m_cgi = "apirxb"; m->m_off = (char *)cr.m_adResultXml[1] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion backup ad feed xml title tag"; m->m_desc = "Specify the full xml path for the results title."; m->m_cgi = "apitxb"; m->m_off = (char *)cr.m_adTitleXml[1] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion backup ad feed xml description tag"; m->m_desc = "Specify the full xml path for the results description."; m->m_cgi = "apidxb"; m->m_off = (char *)cr.m_adDescXml[1] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion backup ad feed xml link tag"; m->m_desc = "Specify the full xml path for the results link. This " "is the link that is shown as plain text, not an actual " "link, below the ad description."; m->m_cgi = "apilxb"; m->m_off = (char *)cr.m_adLinkXml[1] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion backup ad feed xml url tag"; m->m_desc = "Specify the full xml path for the results url. This is " "the link associated with the title."; m->m_cgi = "apiuxb"; m->m_off = (char *)cr.m_adUrlXml[1] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) paid inclusion format text"; m->m_desc = "Specify the formatting text from the
m_cgi = "apift"; m->m_off = (char *)cr.m_adPIFormat - x; m->m_plen = (char *)&cr.m_adPIFormatLen - x; // length of string m->m_type = TYPE_STRINGBOX; m->m_size = MAX_HTML_LEN + 1; m->m_page = PAGE_ADFEED; m->m_def = "style=\"padding: 3px;" "text-align: left; background-color: " "lightyellow;\">Sponsored Results\n" "

"; m->m_group = 0; m++; m->m_title = "(1) skyscraper ad enable"; m->m_desc = "Enable/Disable the skyscraper ad."; m->m_cgi = "asse"; m->m_off = (char *)&cr.m_adSSEnable - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_ADFEED; m->m_def = "1"; m++; m->m_title = "(1) skyscraper ad feed same as paid inclusion"; m->m_desc = "Use the same feed CGI as used above for the paid " "inclusion."; m->m_cgi = "asssap"; m->m_off = (char *)&cr.m_adSSSameasPI - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_ADFEED; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "(1) skyscraper ad feed link"; m->m_desc = "Full link with address and parameters to retrieve an ad " "feed. To specify parameter input: %q for query, %n " "for num results, %p for page number, %i for query ip, " "and %% for %."; m->m_cgi = "asscgi"; m->m_off = (char *)cr.m_adCGI[2] - x; m->m_type = TYPE_STRING; m->m_size = MAX_CGI_URL; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper ad feed xml result tag"; m->m_desc = "Specify the full xml path for a result."; m->m_cgi = "assrx"; m->m_off = (char *)cr.m_adResultXml[2] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper ad feed xml title tag"; m->m_desc = "Specify the full xml path for the results title."; m->m_cgi = "asstx"; m->m_off = (char *)cr.m_adTitleXml[2] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper ad feed xml description tag"; m->m_desc = "Specify the full xml path for the results description."; m->m_cgi = "assdx"; m->m_off = (char *)cr.m_adDescXml[2] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper ad feed xml link tag"; m->m_desc = "Specify the full xml path for the results link. This " "is the link that is shown as plain text, not an actual " "link, below the ad description."; m->m_cgi = "asslx"; m->m_off = (char *)cr.m_adLinkXml[2] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper ad feed xml url tag"; m->m_desc = "Specify the full xml path for the results url. This is " "the link associated with the title."; m->m_cgi = "assux"; m->m_off = (char *)cr.m_adUrlXml[2] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper backup ad feed same as paid inclusion"; m->m_desc = "Use the same feed CGI as used above for the backup paid " "inclusion."; m->m_cgi = "asssapb"; m->m_off = (char *)&cr.m_adBSSSameasBPI - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_ADFEED; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "(1) skyscraper backup ad feed link"; m->m_desc = "Full link with address and parameters to retrieve an ad " "feed. 
To specify parameter input: %q for query, %n " "for num results, %p for page number, %i for query ip, " "and %% for %."; m->m_cgi = "asscgib"; m->m_off = (char *)cr.m_adCGI[3] - x; m->m_type = TYPE_STRING; m->m_size = MAX_CGI_URL; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper backup ad feed xml result tag"; m->m_desc = "Specify the full xml path for a result."; m->m_cgi = "assrxb"; m->m_off = (char *)cr.m_adResultXml[3] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper backup ad feed xml title tag"; m->m_desc = "Specify the full xml path for the results title."; m->m_cgi = "asstxb"; m->m_off = (char *)cr.m_adTitleXml[3] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper backup ad feed xml description tag"; m->m_desc = "Specify the full xml path for the results description."; m->m_cgi = "assdxb"; m->m_off = (char *)cr.m_adDescXml[3] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper backup ad feed xml link tag"; m->m_desc = "Specify the full xml path for the results link. This " "is the link that is shown as plain text, not an actual " "link, below the ad description."; m->m_cgi = "asslxb"; m->m_off = (char *)cr.m_adLinkXml[3] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper backup ad feed xml url tag"; m->m_desc = "Specify the full xml path for the results url. This is " "the link associated with the title."; m->m_cgi = "assuxb"; m->m_off = (char *)cr.m_adUrlXml[3] - x; m->m_type = TYPE_STRING; m->m_size = MAX_XML_LEN; m->m_page = PAGE_ADFEED; m->m_def = ""; m->m_group = 0; m++; m->m_title = "(1) skyscraper format text"; m->m_desc = "Specify the formatting text from the
m_cgi = "assft"; m->m_off = (char *)cr.m_adSSFormat - x; m->m_plen = (char *)&cr.m_adSSFormatLen - x; // length of string m->m_size = MAX_HTML_LEN + 1; m->m_type = TYPE_STRINGBOX; m->m_page = PAGE_ADFEED; m->m_def = "style=\"height: 100%; padding: 3px;" "text-align: center;background-color: " "lightyellow;\">" "Sponsored Results

"; m->m_group = 0; m++; */ /////////////////////////////////////////// // END AD FEED CONTROLS /////////////////////////////////////////// /////////////////////////////////////////// // SEARCH URL CONTROLS // these are only specified in the search url when doing a search /////////////////////////////////////////// ///// // // OLDER SEARCH INPUTS // //// // when we do &qa=1 we do not show things like responseTime in // search results so we can verify serp checksum consistency for QA // in qa.cpp /* m->m_title = "quality assurance"; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_SI; m->m_desc = "This is 1 if doing a QA test in qa.cpp"; m->m_def = "0"; m->m_soff = (char *)&si.m_qa - y; m->m_type = TYPE_CHAR; m->m_sparm = 1; m->m_scgi = "qa"; m++; */ //m->m_title = "show turk forms"; //m->m_desc = "If enabled summaries in search results will be " // "turkable input forms."; //m->m_def = "0"; //m->m_soff = (char *)&si.m_getTurkForm - y; //m->m_type = TYPE_BOOL; //m->m_sparm = 1; //m->m_scgi = "turk"; //m++; // IMPORT PARMS m->m_title = "enable document importation"; m->m_desc = "Import documents into this collection."; m->m_cgi = "import"; m->m_page = PAGE_IMPORT; m->m_obj = OBJ_COLL; m->m_off = (char *)&cr.m_importEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_API; m++; // m->m_title = "collection"; // m->m_desc = "Collection to import documents into."; // m->m_cgi = "c"; // m->m_page = PAGE_IMPORT; // m->m_obj = OBJ_GBREQUEST; // m->m_off = (char *)&cr.m_imcoll - (char *)&gr; // m->m_type = TYPE_CHARPTR; // m->m_def = NULL; // // PF_COLLDEFAULT: so it gets set to default coll on html page // m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML; // m++; m->m_title = "directory containing titledb files"; m->m_desc = "Import documents contained in titledb files in this " "directory. This is an ABSOLUTE directory path."; m->m_cgi = "importdir"; m->m_xml = "importDir"; m->m_page = PAGE_IMPORT; m->m_obj = OBJ_COLL; m->m_off = (char *)&cr.m_importDir - x; m->m_type = TYPE_SAFEBUF; m->m_def = ""; m->m_flags = PF_API; m++; m->m_title = "number of simultaneous injections"; m->m_desc = "Typically try one or two injections per host in " "your cluster."; m->m_cgi = "numimportinjects"; m->m_xml = "numImportInjects"; m->m_page = PAGE_IMPORT; m->m_obj = OBJ_COLL; m->m_off = (char *)&cr.m_numImportInjects - x; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_flags = PF_API; m++; /////////// // // ADD URL PARMS // /////////// m->m_title = "collection"; m->m_desc = "Add urls into this collection."; m->m_cgi = "c"; m->m_page = PAGE_ADDURL2; m->m_obj = OBJ_GBREQUEST; m->m_off = (char *)&gr.m_coll - (char *)&gr; m->m_type = TYPE_CHARPTR; m->m_def = NULL; // PF_COLLDEFAULT: so it gets set to default coll on html page m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML; m++; m->m_title = "urls to add"; m->m_desc = "List of urls to index. One per line or space separated. " "If your url does not index as you expect you " "can check it's spider history by doing a url: search on it. " "Added urls will have a " "hopcount of 0. " "Added urls will match the " "isaddurl directive on " "the url filters page. 
" "The add url api is described on the " "api page."; m->m_cgi = "urls"; m->m_page = PAGE_ADDURL2; m->m_obj = OBJ_GBREQUEST; // do not store in g_conf or collectionrec m->m_off = (char *)&gr.m_urlsBuf - (char *)&gr; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_TEXTAREA | PF_NOSAVE | PF_API|PF_REQUIRED; m++; /* // the new upload post submit button m->m_title = "upload urls"; m->m_desc = "Upload your file of urls."; m->m_cgi = "urls"; m->m_page = PAGE_ADDURL2; m->m_obj = OBJ_NONE; m->m_def = NULL; m->m_type = TYPE_FILEUPLOADBUTTON; m++; */ m->m_title = "strip sessionids"; m->m_desc = "Strip added urls of their session ids."; m->m_cgi = "strip"; m->m_page = PAGE_ADDURL2; m->m_obj = OBJ_GBREQUEST; m->m_off = (char *)&gr.m_stripBox - (char *)&gr; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_flags = PF_API; m++; m->m_title = "harvest links"; m->m_desc = "Harvest links of added urls so we can spider them?."; m->m_cgi = "spiderlinks"; m->m_page = PAGE_ADDURL2; m->m_obj = OBJ_GBREQUEST; m->m_off = (char *)&gr.m_harvestLinks - (char *)&gr; m->m_type = TYPE_CHECKBOX; m->m_def = "1"; m->m_flags = PF_API; m++; /* m->m_title = "force respider"; m->m_desc = "Force an immediate respider even if the url " "is already indexed."; m->m_cgi = "force"; m->m_page = PAGE_ADDURL2; m->m_obj = OBJ_GBREQUEST; m->m_off = (char *)&gr.m_forceRespiderBox - (char *)&gr; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m++; */ //////// // // now the new injection parms // //////// m->m_title = "url"; m->m_desc = "Specify the URL that will be immediately crawled " "and indexed in real time while you wait. The browser " "will return the " "final index status code. Alternatively, " "use the add url page " "to add urls individually or in bulk " "without having to wait for the pages to be " "actually indexed in realtime. " "By default, injected urls " "take precedence over the \"insitelist\" expression in the " "url filters " "so injected urls need not match the patterns in your " "site list. You can " "change that behavior in the url " "filters if you want. " "Injected urls will have a " "hopcount of 0. " "The injection api is described on the " "api page. " "Make up a fake url if you are injecting content that " "does not have one." "
" "
" "If the url ends in .warc or .arc or .warc.gz or .arc.gz " "Gigablast will index the contained documents as individual " "documents, using the appropriate dates and other meta " "information contained in the containing archive file." ; m->m_cgi = "url"; //m->m_cgi2 = "u"; //m->m_cgi3 = "seed"; // pagerawlbot //m->m_cgi4 = "injecturl"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_API | PF_REQUIRED; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_url - (char *)&ir; m++; // alias #1 m->m_title = "url"; m->m_cgi = "u"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_HIDDEN; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_url - (char *)&ir; m++; // alias #2 m->m_title = "url"; m->m_cgi = "seed"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_HIDDEN | PF_DIFFBOT; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_url - (char *)&ir; m++; // alias #3 m->m_title = "url"; m->m_cgi = "injecturl"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_HIDDEN; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_url - (char *)&ir; m++; m->m_title = "query to scrape"; m->m_desc = "Scrape popular search engines for this query " "and inject their links. You are not required to supply " "the url parm if you supply this parm."; m->m_cgi = "qts"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_queryToScrape - (char *)&ir; m++; m->m_title = "inject links"; m->m_desc = "Should we inject the links found in the injected " "content as well?"; m->m_cgi = "injectlinks"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_injectLinks - (char *)&ir; m++; m->m_title = "spider links"; m->m_desc = "Add the outlinks of the injected content into spiderdb " "for spidering?"; m->m_cgi = "spiderlinks"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; // leave off because could start spidering whole web unintentionally m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_spiderLinks - (char *)&ir; m++; m->m_title = "short reply"; m->m_desc = "Should the injection response be short and simple?"; m->m_cgi = "quick"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_HIDDEN; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_shortReply - (char *)&ir; m++; m->m_title = "only inject content if new"; m->m_desc = "If the specified url is already in the index then " "skip the injection."; m->m_cgi = "newonly"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_newOnly - (char *)&ir; m++; m->m_title = "delete from index"; m->m_desc = "Delete the specified url from the index."; m->m_cgi = "deleteurl"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_deleteUrl - (char *)&ir; m++; m->m_title = "recycle content"; m->m_desc = "If the url is already in the index, then do not " "re-download the content, just use the content that was " "stored in the cache from last time."; m->m_cgi = "recycle"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_recycle - (char *)&ir; m++; m->m_title = "dedup url"; m->m_desc = "Do not index the url if there is already 
another " "url in the index with the same content."; m->m_cgi = "dedup"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_dedup - (char *)&ir; m++; m->m_title = "do consistency checking"; m->m_desc = "Turn this on for debugging."; m->m_cgi = "consist"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_HIDDEN; // | PF_API m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_doConsistencyTesting - (char *)&ir; m++; m->m_title = "hop count"; m->m_desc = "Use this hop count when injecting the page."; m->m_cgi = "hopcount"; m->m_obj = OBJ_IR; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_flags = PF_HIDDEN; // | PF_API m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_hopCount - (char *)&ir; m++; m->m_title = "url IP"; m->m_desc = "Use this IP when injecting the document. Do not use or " "set to 0.0.0.0, if unknown. If provided, it will save an IP " "lookup."; m->m_cgi = "urlip"; m->m_obj = OBJ_IR; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_injectDocIp - (char *)&ir; m++; m->m_title = "last spider time"; m->m_desc = "Override last time spidered"; m->m_cgi = "lastspidered"; m->m_obj = OBJ_IR; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_flags = PF_HIDDEN; // | PF_API m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_lastSpidered - (char *)&ir; m++; m->m_title = "first indexed"; m->m_desc = "Override first indexed time"; m->m_cgi = "firstindexed"; m->m_obj = OBJ_IR; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_flags = PF_HIDDEN; // | PF_API m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_firstIndexed - (char *)&ir; m++; m->m_title = "content has mime"; m->m_desc = "If the content of the url is provided below, does " "it begin with an HTTP mime header?"; m->m_cgi = "hasmime"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_hasMime - (char *)&ir; m++; m->m_title = "content delimeter"; m->m_desc = "If the content of the url is provided below, then " "it consist of multiple documents separated by this " "delimeter. Each such item will be injected as an " "independent document. Some possible delimiters: " "======== or <doc>. If you set " "hasmime above to true then Gigablast will check " "for a url after the delimeter and use that url as the " "injected url. Otherwise it will append numbers to the " "url you provide above."; m->m_cgi = "delim"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_contentDelim - (char *)&ir; m++; m->m_title = "content type"; m->m_desc = "If you supply content in the text box below without " "an HTTP mime header, " "then you need to enter the content type. " "Possible values: text/html text/plain text/xml " "application/json"; m->m_cgi = "contenttype"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; //text/html application/json application/xml m->m_def = "text/html"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_contentTypeStr - (char *)&ir; m++; m->m_title = "content charset"; m->m_desc = "A number representing the charset of the content " "if provided below and no HTTP mime header " "is given. Defaults to utf8 " "which is 106. 
" "See iana_charset.h for the numeric values."; m->m_cgi = "charset"; m->m_obj = OBJ_IR; m->m_type = TYPE_LONG; m->m_def = "106"; m->m_flags = PF_API; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_charset - (char *)&ir; m++; m->m_title = "upload content file"; m->m_desc = "Instead of specifying the content to be injected in " "the text box below, upload this file for it."; m->m_cgi = "file"; m->m_obj = OBJ_IR; m->m_type = TYPE_FILEUPLOADBUTTON; m->m_def = NULL; m->m_flags = PF_NOAPI; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_contentFile - (char *)&ir; m++; m->m_title = "content"; m->m_desc = "If you want to supply the URL's content " "rather than have Gigablast download it, then " "enter the content here. " "Enter MIME header " "first if \"content has mime\" is set to true above. " "Separate MIME from actual content with two returns. " "At least put a single space in here if you want to " "inject empty content, otherwise the content will " "be downloaded from the url. This is because the " "page injection form always submits the content text area " "even if it is empty, which should signify that the " "content should be downloaded."; m->m_cgi = "content"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_API|PF_TEXTAREA; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_content - (char *)&ir; m++; m->m_title = "metadata"; m->m_desc = "Json encoded metadata to be indexed with the document."; m->m_cgi = "metadata"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_API|PF_TEXTAREA; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_metadata - (char *)&ir; m++; m->m_title = "get sectiondb voting info"; m->m_desc = "Return section information of injected content for " "the injected subdomain. "; m->m_cgi = "sections"; m->m_obj = OBJ_IR; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_API|PF_NOHTML; m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.m_getSections - (char *)&ir; m++; m->m_title = "diffbot reply"; m->m_desc = "Used exclusively by diffbot. Do not use."; m->m_cgi = "diffbotreply"; m->m_obj = OBJ_IR; m->m_type = TYPE_CHARPTR; m->m_def = NULL; m->m_flags = PF_API|PF_TEXTAREA|PF_NOHTML; // do not show in our api m->m_page = PAGE_INJECT; m->m_off = (char *)&ir.ptr_diffbotReply - (char *)&ir; m++; /////////////////// // // QUERY REINDEX // /////////////////// m->m_title = "collection"; m->m_desc = "query reindex in this collection."; m->m_cgi = "c"; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR; m->m_def = NULL; // PF_COLLDEFAULT: so it gets set to default coll on html page m->m_flags = PF_API|PF_REQUIRED|PF_NOHTML; m->m_page = PAGE_REINDEX; m->m_off = (char *)&gr.m_coll - (char *)&gr; m++; m->m_title = "query to reindex or delete"; m->m_desc = "We either reindex or delete the search results of " "this query. Reindexing them will redownload them and " "possible update the siterank, which is based on the " "number of links to the site. This will add the url " "requests to " "the spider queue so ensure your spiders are enabled."; m->m_cgi = "q"; m->m_off = (char *)&gr.m_query - (char *)&gr; m->m_type = TYPE_CHARPTR; m->m_page = PAGE_REINDEX; m->m_obj = OBJ_GBREQUEST; m->m_def = NULL; m->m_flags = PF_API |PF_REQUIRED; m++; m->m_title = "start result number"; m->m_desc = "Starting with this result #. 
Starts at 0."; m->m_cgi = "srn"; m->m_off = (char *)&gr.m_srn - (char *)&gr; m->m_type = TYPE_LONG; m->m_page = PAGE_REINDEX; m->m_obj = OBJ_GBREQUEST; m->m_def = "0"; m->m_flags = PF_API ; m++; m->m_title = "end result number"; m->m_desc = "Ending with this result #. 0 is the first result #."; m->m_cgi = "ern"; m->m_off = (char *)&gr.m_ern - (char *)&gr; m->m_type = TYPE_LONG; m->m_page = PAGE_REINDEX; m->m_obj = OBJ_GBREQUEST; m->m_def = "99999999"; m->m_flags = PF_API ; m++; m->m_title = "query language"; m->m_desc = "The language the query is in. Used to rank results. " "Just use xx to indicate no language in particular. But " "you should use the same qlang value you used for doing " "the query if you want consistency."; m->m_cgi = "qlang"; m->m_off = (char *)&gr.m_qlang - (char *)&gr; m->m_type = TYPE_CHARPTR; m->m_page = PAGE_REINDEX; m->m_obj = OBJ_GBREQUEST; m->m_def = "en"; m->m_flags = PF_API ; m++; m->m_title = "recycle content"; m->m_desc = "If you check this box then Gigablast will not " "re-download the content, but use the content that was " "stored in the cache from last time. Useful for rebuilding " "the index to pick up new inlink text or fresher " "sitenuminlinks counts which influence ranking."; m->m_cgi = "qrecycle"; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHECKBOX; m->m_def = "0"; m->m_flags = PF_API; m->m_page = PAGE_REINDEX; m->m_off = (char *)&gr.m_recycleContent - (char *)&gr; m++; m->m_title = "FORCE DELETE"; m->m_desc = "Check this checkbox to delete the results, not just " "reindex them."; m->m_cgi = "forcedel"; m->m_off = (char *)&gr.m_forceDel - (char *)&gr; m->m_type = TYPE_CHECKBOX; m->m_page = PAGE_REINDEX; m->m_obj = OBJ_GBREQUEST; m->m_def = "0"; m->m_flags = PF_API ; m++; /////////////////// // // SEARCH CONTROLS // /////////////////// m->m_title = "do spell checking by default"; m->m_desc = "If enabled while using the XML feed, " "when Gigablast finds a spelling recommendation it will be " "included in the XML tag. Default is 0 if using an " "XML feed, 1 otherwise."; m->m_cgi = "spell"; m->m_off = (char *)&cr.m_spellCheck - x; //m->m_soff = (char *)&si.m_spellCheck - y; //m->m_sparm = 1; m->m_type = TYPE_BOOL; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_def = "1"; m->m_flags = PF_API | PF_NOSAVE | PF_CLONE; m++; m->m_title = "get scoring info by default"; m->m_desc = "Get scoring information for each result so you " "can see how each result is scored. 
You must explicitly " "request this using &scores=1 for the XML feed because it " "is not included by default."; m->m_cgi = "scores"; // dedupResultsByDefault"; m->m_off = (char *)&cr.m_getDocIdScoringInfo - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_def = "1"; m->m_flags = PF_API | PF_CLONE; m++; m->m_title = "do query expansion by default"; m->m_desc = "If enabled, query expansion will expand your query " "to include the various forms and " "synonyms of the query terms."; m->m_def = "1"; m->m_off = (char *)&cr.m_queryExpansion - x; m->m_type = TYPE_BOOL; m->m_cgi = "qe"; m->m_page = PAGE_SEARCH; m->m_flags = PF_API | PF_CLONE; m->m_obj = OBJ_COLL; m++; m->m_title = "highlight query terms in summaries by default"; m->m_desc = "Use to disable or enable " "highlighting of the query terms in the summaries."; m->m_def = "1"; m->m_off = (char *)&cr.m_doQueryHighlighting - x; m->m_type = TYPE_BOOL; m->m_cgi = "qh"; m->m_smin = 0; m->m_smax = 8; m->m_sprpg = 1; // turn off for now m->m_sprpp = 1; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max title len"; m->m_desc = "What is the maximum number of " "characters allowed in titles displayed in the search " "results?"; m->m_cgi = "tml"; m->m_off = (char *)&cr.m_titleMaxLen - x; m->m_type = TYPE_LONG; m->m_flags = PF_API | PF_CLONE; m->m_def = "80"; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "consider titles from body"; m->m_desc = "Can Gigablast make titles from the document content? " "Used mostly for the news collection where the title tags " "are not very reliable."; m->m_cgi = "gtfb"; m->m_off = (char *)&cr.m_considerTitlesFromBody - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; //m->m_soff = (char *)&si.m_considerTitlesFromBody - y; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "site cluster by default"; m->m_desc = "Should search results be site clustered? This " "limits each site to appearing at most twice in the " "search results. Sites are subdomains for the most part, " "like abc.xyz.com."; m->m_cgi = "scd"; m->m_off = (char *)&cr.m_siteClusterByDefault - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; // buzz m->m_title = "hide all clustered results"; m->m_desc = "Only display at most one result per site."; m->m_cgi = "hacr"; m->m_off = (char *)&cr.m_hideAllClustered - x; m->m_type = TYPE_BOOL; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m++; m->m_title = "dedup results by default"; m->m_desc = "Should duplicate search results be removed? This is " "based on a content hash of the entire document. " "So documents must be exactly the same for the most part."; m->m_cgi = "drd"; // dedupResultsByDefault"; m->m_off = (char *)&cr.m_dedupResultsByDefault - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 1; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "do tagdb lookups for queries"; m->m_desc = "For each search result a tagdb lookup is made, " "usually across the network on distributed clusters, to " "see if the URL's site has been manually banned in tagdb. 
" "If you don't manually ban sites then turn this off for " "extra speed."; m->m_cgi = "stgdbl"; m->m_off = (char *)&cr.m_doTagdbLookups - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 1; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "percent similar dedup summary default value"; m->m_desc = "If document summary (and title) are " "this percent similar " "to a document summary above it, then remove it from the " "search results. 100 means only to remove if exactly the " "same. 0 means no summary deduping."; m->m_cgi = "psds"; m->m_off = (char *)&cr.m_percentSimilarSummary - x; m->m_type = TYPE_LONG; m->m_def = "90"; m->m_group = 0; m->m_smin = 0; m->m_smax = 100; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of lines to use in summary to dedup"; m->m_desc = "Sets the number of lines to generate for summary " "deduping. This is to help the deduping process not throw " "out valid summaries when normally displayed summaries are " "smaller values. Requires percent similar dedup summary to " "be non-zero."; m->m_cgi = "msld"; m->m_off = (char *)&cr.m_summDedupNumLines - x; m->m_type = TYPE_LONG; m->m_def = "4"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "dedup URLs by default"; m->m_desc = "Should we dedup URLs with case insensitivity? This is " "mainly to correct duplicate wiki pages."; m->m_cgi = "ddu"; m->m_off = (char *)&cr.m_dedupURLDefault - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "use vhost language detection"; m->m_desc = "Use language specific pages for home, etc."; m->m_cgi = "vhost"; m->m_off = (char *)&cr.m_useLanguagePages - x; //m->m_soff = (char *)&si.m_useLanguagePages - y; m->m_type = TYPE_BOOL; m->m_def = "1"; //m->m_scgi = "vhost"; m->m_smin = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "sort language preference default"; m->m_desc = "Default language to use for ranking results. " //"This should only be used on limited collections. " "Value should be any language abbreviation, for example " "\"en\" for English. Use xx to give ranking " "boosts to no language in particular. See the language " "abbreviations at the bottom of the " "url filters page."; m->m_cgi = "defqlang"; m->m_off = (char *)&cr.m_defaultSortLanguage2 - x; m->m_type = TYPE_STRING; m->m_size = 6; // up to 5 chars + NULL, e.g. "en_US" m->m_def = "xx";//_US"; //m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "sort country preference default"; m->m_desc = "Default country to use for ranking results. " //"This should only be used on limited collections. " "Value should be any country code abbreviation, for example " "\"us\" for United States. This is currently not working."; m->m_cgi = "qcountry"; m->m_off = (char *)&cr.m_defaultSortCountry - x; m->m_type = TYPE_STRING; m->m_size = 2+1; m->m_def = "us"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; // for post query reranking m->m_title = "docs to check for post query demotion by default"; m->m_desc = "How many search results should we " "scan for post query demotion? " "0 disables all post query reranking. 
"; m->m_cgi = "pqrds"; m->m_off = (char *)&cr.m_pqr_docsToScan - x; //m->m_soff = (char *)&si.m_docsToScanForReranking - y; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 1; //m->m_scgi = "pqrds"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max summary len"; m->m_desc = "What is the maximum number of " "characters displayed in a summary for a search result?"; m->m_cgi = "sml"; m->m_off = (char *)&cr.m_summaryMaxLen - x; m->m_type = TYPE_LONG; m->m_def = "512"; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max summary excerpts"; m->m_desc = "What is the maximum number of " "excerpts displayed in the summary of a search result?"; m->m_cgi = "smnl"; m->m_off = (char *)&cr.m_summaryMaxNumLines - x; m->m_type = TYPE_LONG; m->m_def = "4"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max summary excerpt length"; m->m_desc = "What is the maximum number of " "characters allowed per summary excerpt?"; m->m_cgi = "smxcpl"; m->m_off = (char *)&cr.m_summaryMaxNumCharsPerLine - x; m->m_type = TYPE_LONG; m->m_def = "90"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; /* m->m_title = "default number of summary excerpts by default"; m->m_desc = "What is the default number of " "summary excerpts displayed per search result?"; m->m_cgi = "sdnl"; m->m_off = (char *)&cr.m_summaryDefaultNumLines - x; m->m_type = TYPE_LONG; m->m_def = "3"; m->m_group = 0; m->m_flags = PF_API; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; */ m->m_title = "max summary line width by default"; m->m_desc = "<br> tags are inserted to keep the number " "of chars in the summary per line at or below this width. " "Also affects title. " "Strings without spaces that exceed this " "width are not split. 
Has no affect on xml or json feed, " "only works on html."; m->m_cgi = "smw"; m->m_off = (char *)&cr.m_summaryMaxWidth - x; m->m_type = TYPE_LONG; m->m_def = "80"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "bytes of doc to scan for summary generation"; m->m_desc = "Truncating this will miss out on good summaries, but " "performance will increase."; m->m_cgi = "clmfs"; m->m_off = (char *)&cr.m_contentLenMaxForSummary - x; m->m_type = TYPE_LONG; m->m_def = "70000"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "Prox summary carver radius"; m->m_desc = "Maximum number of characters to allow in between " "search terms."; m->m_cgi = "pscr"; m->m_off = (char *)&cr.m_proxCarveRadius - x; m->m_type = TYPE_LONG; m->m_def = "256"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "front highlight tag"; m->m_desc = "Front html tag used for highlightig query terms in the " "summaries displated in the search results."; m->m_cgi = "sfht"; m->m_off = (char *)cr.m_summaryFrontHighlightTag - x; m->m_type = TYPE_STRING; m->m_size = SUMMARYHIGHLIGHTTAGMAXSIZE ; m->m_def = ""; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "back highlight tag"; m->m_desc = "Front html tag used for highlightig query terms in the " "summaries displated in the search results."; m->m_cgi = "sbht"; m->m_off = (char *)cr.m_summaryBackHighlightTag - x; m->m_type = TYPE_STRING; m->m_size = SUMMARYHIGHLIGHTTAGMAXSIZE ; m->m_def = ""; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "results to scan for gigabits generation by default"; m->m_desc = "How many search results should we " "scan for gigabit (related topics) generation. Set this to " "zero to disable gigabits generation by default."; m->m_cgi = "dsrt"; m->m_off = (char *)&cr.m_docsToScanForTopics - x; m->m_type = TYPE_LONG; m->m_def = "30"; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "ip restriction for gigabits by default"; m->m_desc = "Should Gigablast only get one document per IP domain " "and per domain for gigabits (related topics) generation?"; m->m_cgi = "ipr"; m->m_off = (char *)&cr.m_ipRestrict - x; m->m_type = TYPE_BOOL; // default to 0 since newspaperarchive only has docs from same IP dom m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "remove overlapping topics"; m->m_desc = "Should Gigablast remove overlapping topics (gigabits)?"; m->m_cgi = "rot"; m->m_off = (char *)&cr.m_topicRemoveOverlaps - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "number of gigabits to show by default"; m->m_desc = "What is the number of " "related topics (gigabits) " "displayed per query? Set to 0 to save " "CPU time."; m->m_cgi = "nrt"; m->m_off = (char *)&cr.m_numTopics - x; m->m_type = TYPE_LONG; m->m_def = "11"; m->m_group = 0; m->m_sprpg = 0; // do not propagate m->m_sprpp = 0; // do not propagate m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "min gigabit score by default"; m->m_desc = "Gigabits (related topics) with scores below this " "will be excluded. 
Scores range from 0% to over 100%."; m->m_cgi = "mts"; m->m_off = (char *)&cr.m_minTopicScore - x; m->m_type = TYPE_LONG; m->m_def = "5"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "min gigabit doc count by default"; m->m_desc = "How many documents must contain the gigabit " "(related topic) in order for it to be displayed."; m->m_cgi = "mdc"; m->m_off = (char *)&cr.m_minDocCount - x; m->m_type = TYPE_LONG; m->m_def = "2"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "dedup doc percent for gigabits (related topics)"; m->m_desc = "If a document is this percent similar to another " "document with a higher score, then it will not contribute " "to the gigabit generation."; m->m_cgi = "dsp"; m->m_off = (char *)&cr.m_dedupSamplePercent - x; m->m_type = TYPE_LONG; m->m_def = "80"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "max words per gigabit (related topic) by default"; m->m_desc = "Maximum number of words a gigabit (related topic) " "can have. Affects xml feeds, too."; m->m_cgi = "mwpt"; m->m_off = (char *)&cr.m_maxWordsPerTopic - x; m->m_type = TYPE_LONG; m->m_def = "6"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "gigabit max sample size"; m->m_desc = "Max chars to sample from each doc for gigabits " "(related topics)."; m->m_cgi = "tmss"; m->m_off = (char *)&cr.m_topicSampleSize - x; m->m_type = TYPE_LONG; m->m_def = "4096"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "gigabit max punct len"; m->m_desc = "Max sequential punct chars allowed in a gigabit " "(related topic). 
" " Set to 1 for speed, 5 or more for best topics but twice as " "slow."; m->m_cgi = "tmpl"; m->m_off = (char *)&cr.m_topicMaxPunctLen - x; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "display dmoz categories in results"; m->m_desc = "If enabled, results in dmoz will display their " "categories on the results page."; m->m_cgi = "ddc"; m->m_off = (char *)&cr.m_displayDmozCategories - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "display indirect dmoz categories in results"; m->m_desc = "If enabled, results in dmoz will display their " "indirect categories on the results page."; m->m_cgi = "didc"; m->m_off = (char *)&cr.m_displayIndirectDmozCategories - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "display Search Category link to query category of result"; m->m_desc = "If enabled, a link will appear next to each category " "on each result allowing the user to perform their query " "on that entire category."; m->m_cgi = "dscl"; m->m_off = (char *)&cr.m_displaySearchCategoryLink - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "use dmoz for untitled"; m->m_desc = "Yes to use DMOZ given title when a page is untitled but " "is in DMOZ."; m->m_cgi = "udfu"; m->m_off = (char *)&cr.m_useDmozForUntitled - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "show dmoz summaries"; m->m_desc = "Yes to always show DMOZ summaries with search results " "that are in DMOZ."; m->m_cgi = "udsm"; m->m_off = (char *)&cr.m_showDmozSummary - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "show adult category on top"; m->m_desc = "Yes to display the Adult category in the Top category"; m->m_cgi = "sacot"; m->m_off = (char *)&cr.m_showAdultCategoryOnTop - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_API | PF_CLONE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; /* m->m_title = "show sensitive info in xml feed"; m->m_desc = "If enabled, we show certain tagb tags for each " "search result, allow &inlinks=1 cgi parms, show " ", etc. in the xml feed. 
Created for buzzlogic."; m->m_cgi = "sss"; m->m_off = (char *)&cr.m_showSensitiveStuff - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ m->m_title = "display indexed date"; m->m_desc = "Display the indexed date along with results."; m->m_cgi = "didt"; m->m_off = (char *)&cr.m_displayIndexedDate - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "display last modified date"; m->m_desc = "Display the last modified date along with results."; m->m_cgi = "dlmdt"; m->m_off = (char *)&cr.m_displayLastModDate - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "display published date"; m->m_desc = "Display the published date along with results."; m->m_cgi = "dipt"; m->m_off = (char *)&cr.m_displayPublishDate - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "enable click 'n' scroll"; m->m_desc = "The [cached] link on results pages loads click n " "scroll."; m->m_cgi = "ecns"; m->m_off = (char *)&cr.m_clickNScrollEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "use data feed account server"; m->m_desc = "Enable/disable the use of a remote account verification " "for Data Feed Customers."; m->m_cgi = "dfuas"; m->m_off = (char *)&cr.m_useDFAcctServer - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "data feed server ip"; m->m_desc = "The ip address of the Gigablast data feed server to " "retrieve customer account information from."; m->m_cgi = "dfip"; m->m_off = (char *)&cr.m_dfAcctIp - x; m->m_type = TYPE_IP; m->m_def = "2130706433"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; m->m_title = "data feed server port"; m->m_desc = "The port of the Gigablast data feed server to retrieve " "customer account information from."; m->m_cgi = "dfport"; m->m_off = (char *)&cr.m_dfAcctPort - x; m->m_type = TYPE_LONG; m->m_def = "8040"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m++; /* m->m_title = "data feed server collection"; m->m_desc = "The collection on the Gigablast data feed server to " "retrieve customer account information from."; m->m_cgi = "dfcoll"; m->m_off = (char *)&cr.m_dfAcctColl - x; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN; m->m_def = "customers"; m->m_group = 0; m++; */ // // not sure cols=x goes here or not // /* m->m_title = "Number Of Columns(1-6)"; m->m_desc = "How many columns results should be shown in. (1-6)"; m->m_cgi = "cols"; m->m_smin = 1; m->m_smax = 6; m->m_off = (char *)&cr.m_numCols - x; m->m_soff = (char *)&si.m_numCols - y; m->m_type = TYPE_LONG; m->m_def = "1"; m->m_group = 0; m->m_sparm = 1; m++; */ // // Gets the screen width // /* m->m_title = "Screen Width"; m->m_desc = "screen size of browser window"; m->m_cgi = "ws"; m->m_smin = 600; m->m_off = (char *)&cr.m_screenWidth - x; m->m_soff = (char *)&si.m_screenWidth - y; m->m_type = TYPE_LONG; m->m_def = "1100"; m->m_group = 0; m->m_sparm = 1; m++; */ /* m->m_title = "collection hostname"; m->m_desc = "Hostname that will default to this collection. 
Blank" " for none or default collection."; m->m_cgi = "chstn"; m->m_off = (char *)cr.m_collectionHostname - x; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_def = ""; m++; m->m_title = "collection hostname (1)"; m->m_desc = "Hostname that will default to this collection. Blank" " for none or default collection."; m->m_cgi = "chstna"; m->m_off = (char *)cr.m_collectionHostname1 - x; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_def = ""; m->m_group = 0; m++; m->m_title = "collection hostname (2)"; m->m_desc = "Hostname that will default to this collection. Blank" " for none or default collection."; m->m_cgi = "chstnb"; m->m_off = (char *)cr.m_collectionHostname2 - x; m->m_type = TYPE_STRING; m->m_size = MAX_URL_LEN; m->m_def = ""; m->m_group = 0; m++; */ m->m_title = "home page"; static SafeBuf s_tmpBuf; s_tmpBuf.setLabel("stmpb1"); s_tmpBuf.safePrintf ( "Html to display for the home page. " "Leave empty for default home page. " "Use %%N for total " "number of pages indexed. Use %%n for number of " "pages indexed " "for the current collection. " //"Use %%H so Gigablast knows where to insert " //"the hidden form input tags, which must be there. " "Use %%c to insert the current collection name. " //"Use %T to display the standard footer. " "Use %%q to display the query in " "a text box. " "Use %%t to display the directory TOP. " "Example to paste into textbox: " "
" ); s_tmpBuf.htmlEncode ( "" "My Gigablast Search Engine" "" "" "

" "
" "" "" "

" "My Search Engine" "

" // "


" // "web " // "     " // "directory " // "     " // "advanced search " // "     " // "" // "add url" // "

" "
" "" "" " " "" "
" "
" "
" "Searching the %c collection of %n " "documents." "
" "
" "") ; s_tmpBuf.safePrintf("
"); m->m_desc = s_tmpBuf.getBufStart(); m->m_xml = "homePageHtml"; m->m_cgi = "hp"; m->m_off = (char *)&cr.m_htmlRoot - x; //m->m_plen = (char *)&cr.m_htmlRootLen - x; // length of string m->m_type = TYPE_SAFEBUF;//STRINGBOX; //m->m_size = MAX_HTML_LEN + 1; m->m_def = ""; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_flags = PF_TEXTAREA | PF_CLONE; m++; m->m_title = "html head"; static SafeBuf s_tmpBuf2; s_tmpBuf2.setLabel("stmpb2"); s_tmpBuf2.safePrintf("Html to display before the search results. "); char *fff = "Leave empty for default. " "Convenient " "for changing colors and displaying logos. Use " "the variable, " "%q, to represent the query to display in a " "text box. " "Use %e to print the url encoded query. " //"Use %e to print the page encoding. " // i guess this is out for now //"Use %D to " //"print a drop down " //"menu for the number of search results to return. " "Use %S " "to print sort by date or relevance link. Use " "%L to " "display the logo. Use %R to display radio " "buttons for site " "search. Use %F to begin the form. and use %H to " "insert " "hidden text " "boxes of parameters like the current search result " "page number. " "BOTH %F and %H are necessary for the html head, but do " "not duplicate them in the html tail. " "Use %f to display " "the family filter radio buttons. " // take this out for now //"Directory: Use %s to display the directory " //"search type options. " //"Use %l to specify the " //"location of " //"dir=rtl in the body tag for RTL pages. " //"Use %where and %when to substitute the where " //"and when of " //"the query. " //"These values may be set based on the cookie " //"if " //"none was explicitly given. " //"IMPORTANT: In the xml configuration file, " //"this html " //"must be encoded (less thans mapped to <, " //"etc.)."; "Example to paste into textbox:
"; s_tmpBuf2.safeStrcpy(fff); s_tmpBuf2.htmlEncode( "\n" "\n" "\n" "My Gigablast Search Results\n" "\n" "\n" "\n" //"
\n" // . %F prints the tag // . method will be GET or POST depending on the size of the // input data. MSIE can't handle sending large GETs requests // that are more than like 1k or so, which happens a lot with // our CTS technology (the sites= cgi parm can be very large) "%F" "\n" "\n" "\n" "\n" "\n" "
" // this prints the Logo "%L" //"" //"\"Gigablast" //"" "\n" "\n" " " // %D is the number of results drop down menu "\%D" "\n" "\n" // family filter // %R radio button for site(s) search "
%f %R\n" // directory search options // MDW: i guess this is out for now //"
%s
\n" // %H prints the hidden for vars. Print them *after* the input // text boxes, radio buttons, etc. so these hidden vars can be // overridden as they should be. "%H"); s_tmpBuf2.safePrintf("
"); m->m_desc = s_tmpBuf2.getBufStart(); m->m_xml = "htmlHead"; m->m_cgi = "hh"; m->m_off = (char *)&cr.m_htmlHead - x; m->m_type = TYPE_SAFEBUF;//STRINGBOX; m->m_def = ""; //m->m_sparm = 1; //m->m_soff = (char *)&si.m_htmlHead - y; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_flags = PF_TEXTAREA | PF_CLONE; m++; m->m_title = "html tail"; static SafeBuf s_tmpBuf3; s_tmpBuf3.setLabel("stmpb3"); s_tmpBuf3.safePrintf("Html to display after the search results. "); s_tmpBuf3.safeStrcpy(fff); s_tmpBuf3.htmlEncode ( "
\n" //"%F" "\n" "\n" //"" "\n" "\n" "
\n" // this old query is overriding a newer query above so // i commented out. mfd 6/2014 //"" //" %D\n" //"\n" //"" // family filter //"
%f %R\n" //"
" //"%R\n" //"
%s
\n" "Try your search on \n" "google  \n" "yahoo " " \n" //"alltheweb" //"\n" "" "dmoz  \n" //"" //"alta vista\n" //"teoma  \n" //"wisenut" //"\n" "\n"); s_tmpBuf3.safePrintf(""); m->m_desc = s_tmpBuf3.getBufStart(); m->m_xml = "htmlTail"; m->m_cgi = "ht"; m->m_off = (char *)&cr.m_htmlTail - x; m->m_type = TYPE_SAFEBUF;//STRINGBOX; m->m_def = ""; //m->m_sparm = 1; //m->m_soff = (char *)&si.m_htmlHead - y; m->m_page = PAGE_SEARCH; m->m_obj = OBJ_COLL; m->m_flags = PF_TEXTAREA | PF_CLONE; m++; /////////////////////////////////////////// // PAGE SPIDER CONTROLS /////////////////////////////////////////// // just a comment in the conf file m->m_desc = "All <, >, \" and # characters that are values for a field " "contained herein must be represented as " "<, >, " and # respectively."; m->m_type = TYPE_COMMENT; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "spidering enabled"; m->m_desc = "Controls just the spiders for this collection."; m->m_cgi = "cse"; m->m_off = (char *)&cr.m_spideringEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; // this linked list of colls is in Spider.cpp and used to only // poll the active spider colls for spidering. so if coll // gets paused/unpaused we have to update it. m->m_flags = PF_CLONE | PF_REBUILDACTIVELIST; m++; m->m_title = "site list"; m->m_xml = "siteList"; m->m_desc = "List of sites to spider, one per line. " "See example site list below. " "Gigablast uses the " "insitelist " "directive on " "the url filters " "page to make sure that the spider only indexes urls " "that match the site patterns you specify here, other than " "urls you add individually via the add urls or inject url " "tools. " "Limit list to 300MB. If you have a lot of INDIVIDUAL urls " "to add then consider using the addurl" " interface."; m->m_cgi = "sitelist"; m->m_off = (char *)&cr.m_siteListBuf - x; m->m_page = PAGE_SPIDER;// PAGE_SITES; m->m_obj = OBJ_COLL; m->m_type = TYPE_SAFEBUF; m->m_func = CommandUpdateSiteList; m->m_def = ""; // rebuild urlfilters now will nuke doledb and call updateSiteList() m->m_flags = PF_TEXTAREA | PF_REBUILDURLFILTERS | PF_CLONE; m++; m->m_title = "reset collection"; m->m_desc = "Remove all documents from the collection and turn " "spiders off."; m->m_cgi = "reset"; m->m_type = TYPE_CMD; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_func2 = CommandResetColl; m->m_cast = 1; m->m_flags = PF_HIDDEN; m++; m->m_title = "restart collection"; m->m_desc = "Remove all documents from the collection and re-add " "seed urls from site list."; m->m_cgi = "restart"; m->m_type = TYPE_CMD; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_func2 = CommandRestartColl; m->m_cast = 1; m++; /* m->m_title = "new spidering enabled"; m->m_desc = "When enabled the spider adds NEW " "pages to your index. 
"; m->m_cgi = "nse"; m->m_off = (char *)&cr.m_newSpideringEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "old spidering enabled"; m->m_desc = "When enabled the spider will re-visit " "and update pages that are already in your index."; m->m_cgi = "ose"; m->m_off = (char *)&cr.m_oldSpideringEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "new spider weight"; m->m_desc = "Weight time slices of new spiders in the priority " "page by this factor relative to the old spider queues."; m->m_cgi = "nsw"; m->m_off = (char *)&cr.m_newSpiderWeight - x; m->m_type = TYPE_FLOAT; m->m_def = "1.0"; m->m_group = 0; m++; */ m->m_title = "max spiders"; m->m_desc = "What is the maximum number of web " "pages the spider is allowed to download " "simultaneously PER HOST for THIS collection? The " "maximum number of spiders over all collections is " "controlled in the master controls."; m->m_cgi = "mns"; m->m_off = (char *)&cr.m_maxNumSpiders - x; m->m_type = TYPE_LONG; // make it the hard max so control is really in the master controls m->m_def = "300"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "spider delay in milliseconds"; m->m_desc = "make each spider wait this many milliseconds before " "getting the ip and downloading the page."; m->m_cgi = "sdms"; m->m_off = (char *)&cr.m_spiderDelayInMilliseconds - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "obey robots.txt"; m->m_xml = "useRobotstxt"; m->m_desc = "If this is true Gigablast will respect " "the robots.txt convention and rel no follow meta tags."; m->m_cgi = "obeyRobots"; m->m_off = (char *)&cr.m_useRobotsTxt - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "obey rel no follow links"; m->m_desc = "If this is true Gigablast will respect " "the rel no follow link attribute."; m->m_cgi = "obeyRelNoFollow"; m->m_off = (char *)&cr.m_obeyRelNoFollowLinks - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "max robots.txt cache age"; m->m_desc = "How many seconds to cache a robots.txt file for. " "86400 is 1 day. 0 means Gigablast will not read from the " "cache at all and will download the robots.txt before every " "page if robots.txt use is enabled above. However, if this is " "0 then Gigablast will still store robots.txt files in the " "cache."; m->m_cgi = "mrca"; m->m_off = (char *)&cr.m_maxRobotsCacheAge - x; m->m_type = TYPE_LONG; m->m_def = "86400"; // 24*60*60 = 1day m->m_units = "seconds"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "always use spider proxies"; m->m_desc = "If this is true Gigablast will ALWAYS use the proxies " "listed on the proxies " "page for " "spidering for " "this collection." //"regardless whether the proxies are enabled " //"on the proxies page." ; m->m_cgi = "useproxies"; m->m_off = (char *)&cr.m_forceUseFloaters - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "automatically use spider proxies"; m->m_desc = "Use the spider proxies listed on the proxies page " "if gb detects that " "a webserver is throttling the spiders. This way we can " "learn the webserver's spidering policy so that our spiders " "can be more polite. 
If no proxies are listed on the " "proxies page then this parameter will have no effect."; m->m_cgi = "automaticallyuseproxies"; m->m_off = (char *)&cr.m_automaticallyUseProxies - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "automatically back off"; m->m_desc = "Set the crawl delay to 5 seconds if gb detects " "that an IP is throttling or banning gigabot from crawling " "it. The crawl delay just applies to that IP. " "Such throttling will be logged."; m->m_cgi = "automaticallybackoff"; m->m_xml = "automaticallyBackOff"; m->m_off = (char *)&cr.m_automaticallyBackOff - x; m->m_type = TYPE_BOOL; // a lot of pages have recaptcha links but they have valid content // so leave this off for now... they have it in a hidden div which // popups to email the article link or whatever to someone. m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "use time axis"; m->m_desc = "If this is true Gigablast will index the same " "url multiple times if its content varies over time, " "rather than overwriting the older version in the index. " "Useful for archive web pages as they change over time."; m->m_cgi = "usetimeaxis"; m->m_off = (char *)&cr.m_useTimeAxis - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "index warc or arc files"; m->m_desc = "If this is true Gigablast will index .warc and .arc " "files by injecting the pages contained in them as if they " "were spidered with the content in the .warc or .arc file. " "The spidered time will be taken from the archive file " "as well."; m->m_cgi = "indexwarcs"; m->m_off = (char *)&cr.m_indexWarcs - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; /* m->m_title = "add url enabled"; m->m_desc = "If this is enabled others can add " "web pages to your index via the add url page."; m->m_cgi = "aue"; m->m_off = (char *)&cr.m_addUrlEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m++; */ m->m_title = "daily merge time"; m->m_desc = "Do a tight merge on posdb and titledb at this time " "every day. This is expressed in MINUTES past midnight UTC. " "UTC is 5 hours ahead " "of EST and 7 hours ahead of MST. Leave this as -1 to " "NOT perform a daily merge. To merge at midnight EST use " "60*5=300 and midnight MST use 60*7=420."; m->m_cgi = "dmt"; m->m_off = (char *)&cr.m_dailyMergeTrigger - x; m->m_type = TYPE_LONG; m->m_def = "-1"; m->m_units = "minutes"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "daily merge days"; m->m_desc = "Comma separated list of days to merge on. Use " "0 for Sunday, 1 for Monday, ... 6 for Saturday. Leaving " "this parameter empty or without any numbers will make the " "daily merge happen every day"; m->m_cgi = "dmdl"; m->m_off = (char *)&cr.m_dailyMergeDOWList - x; m->m_type = TYPE_STRING; m->m_size = 48; // make sunday the default m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "daily merge last started"; m->m_desc = "When the daily merge was last kicked off. 
Expressed in " "UTC in seconds since the epoch."; m->m_cgi = "dmls"; m->m_off = (char *)&cr.m_dailyMergeStarted - x; m->m_type = TYPE_LONG_CONST; m->m_def = "-1"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_NOAPI; m++; /* m->m_title = "use datedb"; m->m_desc = "Index documents for generating results sorted by date " "or constrained by date range. Only documents indexed while " "this is enabled will be returned for date-related searches."; m->m_cgi = "ud"; m->m_off = (char *)&cr.m_useDatedb - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "age cutoff for datedb"; m->m_desc = "Do not index pubdates into datedb that are more " "than this many days old. Use -1 for no limit. A value " "of zero essentially turns off datedb. Pre-existing pubdates " "in datedb that fail to meet this constraint WILL BE " "COMPLETELY ERASED when datedb is merged."; m->m_cgi = "dbc"; m->m_off = (char *)&cr.m_datedbCutoff - x; m->m_type = TYPE_LONG; m->m_def = "-1"; m->m_units = "days"; m++; m->m_title = "datedb default timezone"; m->m_desc = "Default timezone to use when none specified on parsed " "time. Use offset from GMT, i.e 0400 (AMT) or -0700 (MST)"; m->m_cgi = "ddbdt"; m->m_off = (char *)&cr.m_datedbDefaultTimezone - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m++; */ //m->m_title = "days before now to index"; //m->m_desc = "Only index page if the datedb date was found to be " // "within this many days of the current time. Use 0 to index " // "all dates. Parm is float for fine control."; //m->m_cgi = "ddbdbn"; //m->m_off = (char *)&cr.m_datedbDaysBeforeNow - x; //m->m_type = TYPE_FLOAT; //m->m_def = "0"; //m->m_group = 0; //m++; m->m_title = "turing test enabled"; m->m_desc = "If this is true, users will have to " "pass a simple Turing test to add a url. This prevents " "automated url submission."; m->m_cgi = "dtt"; m->m_off = (char *)&cr.m_doTuringTest - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "max add urls"; m->m_desc = "Maximum number of urls that can be " "submitted via the addurl interface, per IP domain, per " "24 hour period. A value less than or equal to zero " "implies no limit."; m->m_cgi = "mau"; m->m_off = (char *)&cr.m_maxAddUrlsPerIpDomPerDay - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; // use url filters harvest links parm for this now /* m->m_title = "spider links"; m->m_desc = "If this is false, the spider will not " "harvest links from web pages it visits. Links that it does " "harvest will be attempted to be indexed at a later time. "; m->m_cgi = "sl"; m->m_off = (char *)&cr.m_spiderLinks - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m++; */ /* MDW: use the "onsite" directive in the url filters page now... m->m_title = "only spider links from same host"; m->m_desc = "If this is true the spider will only harvest links " "to pages that are contained on the same host as the page " "that is being spidered. " "Example: When spidering a page from " "www.gigablast.com, only links to pages that are from " "www.gigablast.com would " "be harvested, if this switch were enabled. 
This allows you " "to seed the spider with URLs from a specific set of hosts " "and ensure that only links to pages that are from those " "hosts are harvested."; m->m_cgi = "slsh"; m->m_off = (char *)&cr.m_sameHostLinks - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m++; */ m->m_title = "do not re-add old outlinks more than this many days"; m->m_desc = "If less than this many days have elapsed since the " "last time we added the outlinks to spiderdb, do not re-add " "them to spiderdb. Saves resources."; m->m_cgi = "slrf"; m->m_off = (char *)&cr.m_outlinksRecycleFrequencyDays - x; m->m_type = TYPE_FLOAT; m->m_def = "30"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; /* m->m_title = "spider links by priority"; m->m_desc = "Specify priorities for which links should be spidered. " "If the spider links option above is " "disabled then these setting will have no effect."; m->m_cgi = "slp"; m->m_xml = "spiderLinksByPriority"; m->m_off = (char *)&cr.m_spiderLinksByPriority - x; m->m_type = TYPE_PRIORITY_BOXES; // array of numbered (0-(MAX_SPIDER_PRIORITIES-1)) checkboxes m->m_fixed = MAX_SPIDER_PRIORITIES; m->m_def = "1"; // default for each one is on m->m_group = 0; m++; */ /* m->m_title = "min link priority"; m->m_desc = "Only add links to the spider " "queue if their spider priority is this or higher. " "This can make the spider process more efficient " "since a lot of disk seeks are used when adding " "links."; m->m_cgi = "mlp"; m->m_off = (char *)&cr.m_minLinkPriority - x; m->m_type = TYPE_PRIORITY; m->m_def = "0"; m->m_group = 0; m++; */ /* m->m_title = "maximum hops from parent page"; m->m_desc = "Only index pages that are within a particular number " "of hops from the parent page given in Page Add Url. 
-1 means " "that max hops is infinite."; m->m_cgi = "mnh"; m->m_off = (char *)&cr.m_maxNumHops - x; m->m_type = TYPE_CHAR2; m->m_def = "-1"; m->m_group = 0; m++;*/ m->m_title = "scraping enabled procog"; m->m_desc = "Do searches for queries in this hosts part of the " "query log."; m->m_cgi = "scrapepc"; m->m_off = (char *)&cr.m_scrapingEnabledProCog - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "scraping enabled web"; m->m_desc = "Perform random searches on googles news search engine " "to add sites with ingoogle tags into tagdb."; m->m_cgi = "scrapeweb"; m->m_off = (char *)&cr.m_scrapingEnabledWeb - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "scraping enabled news"; m->m_desc = "Perform random searches on googles news search engine " "to add sites with news and goognews and ingoogle " "tags into tagdb."; m->m_cgi = "scrapenews"; m->m_off = (char *)&cr.m_scrapingEnabledNews - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "scraping enabled blogs"; m->m_desc = "Perform random searches on googles news search engine " "to add sites with blogs and googblogs and ingoogle " "tags into tagdb."; m->m_cgi = "scrapeblogs"; m->m_off = (char *)&cr.m_scrapingEnabledBlogs - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; /* m->m_title = "subsite detection enabled"; m->m_desc = "Add the \"sitepathdepth\" to tagdb if a hostname " "is determined to have subsites at a particular depth."; m->m_cgi = "ssd"; m->m_off = (char *)&cr.m_subsiteDetectionEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ m->m_title = "deduping enabled"; m->m_desc = "When enabled, the spider will " "discard web pages which are identical to other web pages " "that are already in the index. "//AND that are from the same " //"hostname. //"An example of a hostname is www1.ibm.com. " "However, root urls, urls that have no path, are never " "discarded. It most likely has to hit disk to do these " "checks so it does cause some slow down. 
Only use it if you " "need it."; m->m_cgi = "de"; m->m_off = (char *)&cr.m_dedupingEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "deduping enabled for www"; m->m_desc = "When enabled, the spider will " "discard web pages which, when a www is prepended to the " "page's url, result in a url already in the index."; m->m_cgi = "dew"; m->m_off = (char *)&cr.m_dupCheckWWW - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "detect custom error pages"; m->m_desc = "Detect and do not index pages which have a 200 status" " code, but are likely to be error pages."; m->m_cgi = "dcep"; m->m_off = (char *)&cr.m_detectCustomErrorPages - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "delete 404s"; m->m_desc = "Should pages be removed from the index if they are no " "longer accessible on the web?"; m->m_cgi = "dnf"; m->m_off = (char *)&cr.m_delete404s - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_HIDDEN; m++; m->m_title = "delete timed out docs"; m->m_desc = "Should documents be deleted from the index " "if they have been retried them enough times and the " "last received error is a time out? " "If your internet connection is flaky you may say " "no here to ensure you do not lose important docs."; m->m_cgi = "dtod"; m->m_off = (char *)&cr.m_deleteTimeouts - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "use simplified redirects"; m->m_desc = "If this is true, the spider, when a url redirects " "to a \"simpler\" url, will add that simpler url into " "the spider queue and abandon the spidering of the current " "url."; m->m_cgi = "usr"; m->m_off = (char *)&cr.m_useSimplifiedRedirects - x; m->m_type = TYPE_BOOL; // turn off for now. spider time deduping should help any issues // by disabling this. m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "use canonical redirects"; m->m_desc = "If page has a on it then treat it " "as a redirect, add it to spiderdb for spidering " "and abandon the indexing of the current url."; m->m_cgi = "ucr"; m->m_off = (char *)&cr.m_useCanonicalRedirects - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m->m_group = 0; m++; m->m_title = "use ifModifiedSince"; m->m_desc = "If this is true, the spider, when " "updating a web page that is already in the index, will " "not even download the whole page if it hasn't been " "updated since the last time Gigablast spidered it. " "This is primarily a bandwidth saving feature. 
It relies on " "the remote webserver's returned Last-Modified-Since field " "being accurate."; m->m_cgi = "uims"; m->m_off = (char *)&cr.m_useIfModifiedSince - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "build similarity vector from content only"; m->m_desc = "If this is true, the spider, when checking the page " "if it has changed enough to reindex or update the " "published date, it will build the vector only from " "the content located on that page."; m->m_cgi = "bvfc"; m->m_off = (char *)&cr.m_buildVecFromCont - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "use content similarity to index publish date"; m->m_desc = "This requires build similarity from content only to be " "on. This indexes the publish date (only if the content " "has changed enough) to be between the last two spider " "dates."; m->m_cgi = "uspd"; m->m_off = (char *)&cr.m_useSimilarityPublishDate - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "max percentage similar to update publish date"; m->m_desc = "This requires build similarity from content only and " "use content similarity to index publish date to be " "on. This percentage is the maximum similarity that can " "exist between an old document and new before the publish " "date will be updated."; m->m_cgi = "mpspd"; m->m_off = (char *)&cr.m_maxPercentSimilarPublishDate - x; m->m_type = TYPE_LONG; m->m_def = "80"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; // use url filters for this. this is a crawlbot parm really. /* m->m_title = "restrict domain"; m->m_desc = "Keep crawler on same domain as seed urls?"; m->m_cgi = "restrictDomain"; m->m_off = (char *)&cr.m_restrictDomain - x; m->m_type = TYPE_BOOL; m->m_def = "1"; // we need to save this it is a diffbot parm m->m_flags = PF_HIDDEN | PF_DIFFBOT;// | PF_NOSAVE; m++; */ m->m_title = "do url sporn checking"; m->m_desc = "If this is true and the spider finds " "lewd words in the hostname of a url it will throw " "that url away. It will also throw away urls that have 5 or " "more hyphens in their hostname."; m->m_cgi = "dusc"; m->m_off = (char *)&cr.m_doUrlSpamCheck - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; /* m->m_title = "hours before adding unspiderable url to spiderdb"; m->m_desc = "Hours to wait after trying to add an unspiderable url " "to spiderdb again."; m->m_cgi = "dwma"; m->m_off = (char *)&cr.m_deadWaitMaxAge - x; m->m_type = TYPE_LONG; m->m_def = "24"; m++; */ //m->m_title = "link text anomaly threshold"; //m->m_desc = "Prevent pages from link voting for " // "another page if its link text has a " // "word which doesn't occur in at least this " // "many other link texts. 
	/*
	m->m_title = "hours before adding unspiderable url to spiderdb";
	m->m_desc = "Hours to wait after trying to add an unspiderable url "
		"to spiderdb again.";
	m->m_cgi = "dwma";
	m->m_off = (char *)&cr.m_deadWaitMaxAge - x;
	m->m_type = TYPE_LONG;
	m->m_def = "24";
	m++;
	*/

	//m->m_title = "link text anomaly threshold";
	//m->m_desc = "Prevent pages from link voting for another page if "
	//	"its link text has a word which doesn't occur in at "
	//	"least this many other link texts. (set to 1 to disable)";
	//m->m_cgi = "ltat";
	//m->m_off = (char *)&cr.m_linkTextAnomalyThresh - x;
	//m->m_type = TYPE_LONG;
	//m->m_def = "2";
	//m++;

	/*
	m->m_title = "enforce domain quotas on new docs";
	m->m_desc = "If this is true then new documents will be removed "
		"from the index if the quota for their domain has been "
		"breached.";
	m->m_cgi = "enq";
	m->m_off = (char *)&cr.m_enforceNewQuotas - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "0";
	m++;

	m->m_title = "enforce domain quotas on indexed docs";
	m->m_desc = "If this is true then indexed documents will be "
		"removed from the index if the quota for their domain "
		"has been breached.";
	m->m_cgi = "eoq";
	m->m_off = (char *)&cr.m_enforceOldQuotas - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "0";
	m->m_group = 0;
	m++;

	m->m_title = "use exact quotas";
	m->m_desc = "Does not use approximations so will do more disk "
		"seeks and may impact indexing performance significantly.";
	m->m_cgi = "ueq";
	m->m_off = (char *)&cr.m_exactQuotas - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "0";
	m->m_group = 0;
	m++;

	m->m_title = "restrict indexdb for spidering";
	m->m_desc = "If this is true then only the root indexdb file is "
		"searched for linkers. Saves on disk seeks, but may use "
		"older versions of indexed web pages.";
	m->m_cgi = "ris";
	m->m_off = (char *)&cr.m_restrictIndexdbForSpider - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "0";
	m++;
	*/

	/*
	m->m_title = "indexdb max total files to merge";
	m->m_desc = "Do not merge more than this many files during a "
		"single merge operation. Merge does not scale well to "
		"numbers above 50 or so.";
	m->m_cgi = "mttftm";
	m->m_off = (char *)&cr.m_indexdbMinTotalFilesToMerge - x;
	m->m_def = "50";
	//m->m_max = 100;
	m->m_type = TYPE_LONG;
	m++;

	m->m_title = "indexdb min files needed to trigger merge";
	m->m_desc = "Merge is triggered when this many indexdb data files "
		"are on disk.";
	m->m_cgi = "miftm";
	m->m_off = (char *)&cr.m_indexdbMinFilesToMerge - x;
	m->m_def = "6"; // default to high query performance, not spider
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m++;

	m->m_title = "datedb min files needed to trigger merge";
	m->m_desc = "Merge is triggered when this many datedb data files "
		"are on disk.";
	m->m_cgi = "mdftm";
	m->m_off = (char *)&cr.m_datedbMinFilesToMerge - x;
	m->m_def = "5";
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m++;

	m->m_title = "spiderdb min files needed to trigger merge";
	m->m_desc = "Merge is triggered when this many spiderdb data files "
		"are on disk.";
	m->m_cgi = "msftm";
	m->m_off = (char *)&cr.m_spiderdbMinFilesToMerge - x;
	m->m_def = "2";
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m++;

	m->m_title = "checksumdb min files needed to trigger merge";
	m->m_desc = "Merge is triggered when this many checksumdb data "
		"files are on disk.";
	m->m_cgi = "mcftm";
	m->m_off = (char *)&cr.m_checksumdbMinFilesToMerge - x;
	m->m_def = "2";
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m++;

	m->m_title = "clusterdb min files needed to trigger merge";
	m->m_desc = "Merge is triggered when this many clusterdb data "
		"files are on disk.";
	m->m_cgi = "mclftm";
	m->m_off = (char *)&cr.m_clusterdbMinFilesToMerge - x;
	m->m_def = "2";
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m++;
	*/
	m->m_title = "linkdb min files needed to trigger to merge";
	m->m_desc = "Merge is triggered when this many linkdb data files "
		"are on disk. Raise this when initially growing an index "
		"in order to keep merging down.";
	m->m_cgi = "mlkftm";
	m->m_off = (char *)&cr.m_linkdbMinFilesToMerge - x;
	m->m_def = "6";
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m->m_flags = PF_CLONE;//PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SPIDER;
	m->m_obj = OBJ_COLL;
	m++;

	m->m_title = "tagdb min files to merge";
	m->m_desc = "Merge is triggered when this many tagdb data files "
		"are on disk.";
	m->m_cgi = "mtftgm";
	m->m_off = (char *)&cr.m_tagdbMinFilesToMerge - x;
	m->m_def = "2";
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m->m_flags = PF_CLONE;//PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SPIDER;
	m->m_obj = OBJ_COLL;
	m++;

	// this is overridden by collection
	m->m_title = "titledb min files needed to trigger to merge";
	m->m_desc = "Merge is triggered when this many titledb data files "
		"are on disk.";
	m->m_cgi = "mtftm";
	m->m_off = (char *)&cr.m_titledbMinFilesToMerge - x;
	m->m_def = "6";
	m->m_type = TYPE_LONG;
	//m->m_save = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SPIDER;
	m->m_obj = OBJ_COLL;
	m++;

	//m->m_title = "sectiondb min files to merge";
	//m->m_desc = "Merge is triggered when this many sectiondb data "
	//	"files are on disk.";
	//m->m_cgi = "mscftm";
	//m->m_off = (char *)&cr.m_sectiondbMinFilesToMerge - x;
	//m->m_def = "4";
	//m->m_type = TYPE_LONG;
	//m->m_group = 0;
	//m++;

	m->m_title = "posdb min files needed to trigger to merge";
	m->m_desc = "Merge is triggered when this many posdb data files "
		"are on disk. Raise this while doing massive injections "
		"and not doing much querying. Then when done injecting "
		"keep this low to make queries fast.";
	m->m_cgi = "mpftm";
	m->m_off = (char *)&cr.m_posdbMinFilesToMerge - x;
	m->m_def = "6";
	m->m_type = TYPE_LONG;
	m->m_group = 0;
	m->m_flags = PF_CLONE;//PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SPIDER;
	m->m_obj = OBJ_COLL;
	m++;
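	// All of the *MinFilesToMerge parms above feed the same trigger.
	// A hedged sketch of the decision (hypothetical names; the real
	// logic lives in the Rdb merge code):
	//
	//   #include <stdint.h>
	//   // returns true if this rdb should kick off a file merge now
	//   bool needsMerge ( int32_t numFilesOnDisk ,
	//                     int32_t minFilesToMerge ) {
	//           // merge once the file count reaches the threshold
	//           return numFilesOnDisk >= minFilesToMerge;
	//   }
	//
	// Raising the threshold defers merges (useful while injecting
	// heavily); lowering it keeps the file count, and therefore the
	// disk seeks per lookup, low for fast queries.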
" "The greater the number of inlinks, the longer the cached " "time, because the site is considered more stable. If this " "is NOT true then Gigablast will use the included file, " "sitelinks.txt, which stores the site inlinks of millions " "of the most popular sites. This is the fastest way. If you " "notice a lot of getting link info requests in the " "sockets table you may want to disable this " "parm."; m->m_cgi = "csni"; m->m_off = (char *)&cr.m_computeSiteNumInlinks - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_CLONE|PF_API;//PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "do link spam checking"; m->m_desc = "If this is true, do not allow spammy inlinks to vote. " "This check is " "too aggressive for some collections, i.e. it " "does not allow pages with cgi in their urls to vote."; m->m_cgi = "dlsc"; m->m_off = (char *)&cr.m_doLinkSpamCheck - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "restrict link voting by ip"; m->m_desc = "If this is true Gigablast will " "only allow one vote per the top 2 significant bytes " "of the IP address. Otherwise, multiple pages " "from the same top IP can contribute to the link text and " "link-based quality ratings of a particular URL. " "Furthermore, no votes will be accepted from IPs that have " "the same top 2 significant bytes as the IP of the page " "being indexed."; m->m_cgi = "ovpid"; m->m_off = (char *)&cr.m_oneVotePerIpDom - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "use new link algo"; m->m_desc = "Use the links: termlists instead of link:. Also " "allows pages linking from the same domain or IP to all " "count as a single link from a different IP. This is also " "required for incorporating RSS and Atom feed information " "when indexing a document."; m->m_cgi = "na"; m->m_off = (char *)&cr.m_newAlgo - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; /* m->m_title = "recycle link votes"; m->m_desc = "If this is true Gigablast will " "use the old links and link text when re-indexing old urls " "and not do any link voting when indexing new urls."; m->m_cgi = "rv"; m->m_off = (char *)&cr.m_recycleVotes - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m++; */ m->m_title = "update link info frequency"; m->m_desc = "How often should Gigablast recompute the " "link info for a url. " "Also applies to getting the quality of a site " "or root url, which is based on the link info. " "In days. Can use decimals. 0 means to update " "the link info every time the url's content is re-indexed. " "If the content is not reindexed because it is unchanged " "then the link info will not be updated. When getting the " "link info or quality of the root url from an " "external cluster, Gigablast will tell the external cluster " "to recompute it if its age is this or higher."; m->m_cgi = "uvf"; m->m_off = (char *)&cr.m_updateVotesFreq - x; m->m_type = TYPE_FLOAT; m->m_def = "60.000000"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; /* m->m_title = "recycle imported link info"; m->m_desc = "If true, we ALWAYS recycle the imported link info and " "NEVER recompute it again. 
	m->m_title = "use new link algo";
	m->m_desc = "Use the links: termlists instead of link:. Also "
		"allows pages linking from the same domain or IP to all "
		"count as a single link from a different IP. This is "
		"also required for incorporating RSS and Atom feed "
		"information when indexing a document.";
	m->m_cgi = "na";
	m->m_off = (char *)&cr.m_newAlgo - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SPIDER;
	m->m_obj = OBJ_COLL;
	m++;

	/*
	m->m_title = "recycle link votes";
	m->m_desc = "If this is true Gigablast will use the old links and "
		"link text when re-indexing old urls and not do any link "
		"voting when indexing new urls.";
	m->m_cgi = "rv";
	m->m_off = (char *)&cr.m_recycleVotes - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "0";
	m->m_group = 0;
	m++;
	*/

	m->m_title = "update link info frequency";
	m->m_desc = "How often should Gigablast recompute the link info "
		"for a url. Also applies to getting the quality of a "
		"site or root url, which is based on the link info. In "
		"days. Can use decimals. 0 means to update the link info "
		"every time the url's content is re-indexed. If the "
		"content is not reindexed because it is unchanged then "
		"the link info will not be updated. When getting the "
		"link info or quality of the root url from an external "
		"cluster, Gigablast will tell the external cluster to "
		"recompute it if its age is this or higher.";
	m->m_cgi = "uvf";
	m->m_off = (char *)&cr.m_updateVotesFreq - x;
	m->m_type = TYPE_FLOAT;
	m->m_def = "60.000000";
	m->m_group = 0;
	m->m_page = PAGE_SPIDER;
	m->m_obj = OBJ_COLL;
	m->m_flags = PF_CLONE;
	m++;

	/*
	m->m_title = "recycle imported link info";
	m->m_desc = "If true, we ALWAYS recycle the imported link info "
		"and NEVER recompute it again. Otherwise, recompute it "
		"when we recompute the local link info.";
	m->m_cgi = "rili";
	m->m_off = (char *)&cr.m_recycleLinkInfo2 - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m->m_group = 0;
	m++;
	*/

	/*
	m->m_title = "use imported link info for quality";
	m->m_desc = "If true, we will use the imported link info to help "
		"us determine the quality of the page we are indexing.";
	m->m_cgi = "uifq";
	m->m_off = (char *)&cr.m_useLinkInfo2ForQuality - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m->m_group = 0;
	m++;
	*/

	// this can hurt us too much if mis-assigned, remove it
	/*
	m->m_title = "restrict link voting to roots";
	m->m_desc = "If this is true Gigablast will not perform link "
		"analysis on urls that are not root urls.";
	m->m_cgi = "rvr";
	m->m_off = (char *)&cr.m_restrictVotesToRoots - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "0";
	m->m_group = 0;
	m++;
	*/

	/*
	m->m_title = "index link text";
	m->m_desc = "If this is true Gigablast will index both incoming "
		"and outgoing link text for the appropriate documents, "
		"depending on url filters and site rules, under the "
		"gbinlinktext: and gboutlinktext: fields. Generally, you "
		"want this disabled, it was for a client.";
	m->m_cgi = "ilt";
	m->m_off = (char *)&cr.m_indexLinkText - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "0";
	m->m_group = 0;
	m++;
	*/

	/*
	m->m_title = "index incoming link text";
	m->m_desc = "If this is false no incoming link text is indexed.";
	m->m_cgi = "iilt";
	m->m_off = (char *)&cr.m_indexLinkText - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m->m_group = 0;
	m++;
	*/

	m->m_title = "index inlink neighborhoods";
	m->m_desc = "If this is true Gigablast will index the plain text "
		"surrounding the hyper-link text. The score will be x "
		"times that of the hyper-link text, where x is the "
		"scalar below.";
	m->m_cgi = "iin";
	m->m_off = (char *)&cr.m_indexInlinkNeighborhoods - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m->m_group = 0;
	m->m_flags = PF_HIDDEN | PF_NOSAVE;
	m->m_page = PAGE_SPIDER;
	m->m_obj = OBJ_COLL;
	m++;

	/*
	// this is now hard-coded in XmlNode.cpp, currently .8
	m->m_title = "inlink neighborhoods score scalar";
	m->m_desc = "Gigablast can index the plain text surrounding the "
		"hyper-link text. The score will be x times that of the "
		"hyper-link text, where x is this number.";
	m->m_cgi = "inss";
	m->m_off = (char *)&cr.m_inlinkNeighborhoodsScoreScalar - x;
	m->m_type = TYPE_FLOAT;
	m->m_def = ".20";
	m->m_group = 0;
	m++;
	*/

	/*
	m->m_title = "break web rings";
	m->m_desc = "If this is true Gigablast will attempt to detect "
		"link spamming rings and decrease their influence on the "
		"link text for a URL.";
	m->m_cgi = "bwr";
	m->m_off = (char *)&cr.m_breakWebRings - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m->m_group = 0;
	m++;
	*/

	/*
	m->m_title = "break log spam";
	m->m_desc = "If this is true Gigablast will attempt to detect "
		"dynamically generated pages and remove their voting "
		"power. Additionally, pages over 100k will not have "
		"their outgoing links counted. Pages that have a form "
		"which POSTS to a cgi page will not be considered "
		"either.";
	m->m_cgi = "bls";
	m->m_off = (char *)&cr.m_breakLogSpam - x;
	m->m_type = TYPE_BOOL;
	m->m_def = "1";
	m++;
	*/

	m->m_title = "tagdb collection name";
	m->m_desc = "Sometimes you want the spiders to use the tagdb of "
		"another collection, like the main collection. "
" "If this is empty it defaults to the current collection."; m->m_cgi = "tdbc"; m->m_off = (char *)&cr.m_tagdbColl - x; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN+1; m->m_def = ""; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "catdb lookups enabled"; m->m_desc = "Spiders will look to see if the current page is in " "catdb. If it is, all Directory information for that page " "will be indexed with it."; m->m_cgi = "cdbe"; m->m_off = (char *)&cr.m_catdbEnabled - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "recycle catdb info"; m->m_desc = "Rather than requesting new info from DMOZ, like " "titles and topic ids, grab it from old record. Increases " "performance if you are seeing a lot of " "\"getting catdb record\" entries in the spider queues."; m->m_cgi = "rci"; m->m_off = (char *)&cr.m_recycleCatdb - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "allow banning of pages in catdb"; m->m_desc = "If this is 'NO' then pages that are in catdb, " "but banned from tagdb or the url filters page, can not " "be banned."; m->m_cgi = "abpc"; m->m_off = (char *)&cr.m_catdbPagesCanBeBanned - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "override spider errors for catdb"; m->m_desc = "Ignore and skip spider errors if the spidered site" " is found in Catdb (DMOZ)."; m->m_cgi = "catose"; m->m_off = (char *)&cr.m_overrideSpiderErrorsForCatdb - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; //m->m_title = "only spider root urls"; //m->m_desc = "Only spider urls that are roots."; //m->m_cgi = "osru"; //m->m_off = (char *)&cr.m_onlySpiderRoots - x; //m->m_type = TYPE_BOOL; //m->m_def = "0"; //m++; m->m_title = "allow asian docs"; m->m_desc = "If this is disabled the spider " "will not allow any docs from the gb2312 charset " "into the index."; m->m_cgi = "aad"; m->m_off = (char *)&cr.m_allowAsianDocs - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "allow adult docs"; m->m_desc = "If this is disabled the spider " "will not allow any docs which contain adult content " "into the index (overrides tagdb)."; m->m_cgi = "aprnd"; m->m_off = (char *)&cr.m_allowAdultDocs - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0 ; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "allow xml docs"; m->m_desc = "If this is disabled the spider " "will not allow any xml " "into the index."; m->m_cgi = "axd"; m->m_off = (char *)&cr.m_allowXmlDocs - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "do serp detection"; m->m_desc = "If this is enabled the spider " "will not allow any docs which are determined to " "be serps."; m->m_cgi = "dsd"; m->m_off = (char *)&cr.m_doSerpDetection - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "do IP lookup"; m->m_desc = "If this is disabled and the proxy 
" "IP below is not zero then Gigablast will assume " "all spidered URLs have an IP address of 1.2.3.4."; m->m_cgi = "dil"; m->m_off = (char *)&cr.m_doIpLookups - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "use old IPs"; m->m_desc = "Should the stored IP " "of documents we are reindexing be used? Useful for " "pages banned by IP address and then reindexed with " "the reindexer tool."; m->m_cgi = "useOldIps"; m->m_off = (char *)&cr.m_useOldIps - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "remove banned pages"; m->m_desc = "Remove banned pages from the index. Pages can be " "banned using tagdb or the Url Filters table."; m->m_cgi = "rbp"; m->m_off = (char *)&cr.m_removeBannedPages - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; /* m->m_title = "ban domains of urls banned by IP"; m->m_desc = "Most urls are banned by IP " "address. But owners often will keep the same " "domains and change their IP address. So when " "banning a url that was banned by IP, should its domain " "be banned too? (obsolete)"; m->m_cgi = "banDomains"; m->m_off = (char *)&cr.m_banDomains - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ m->m_title = "allow HTTPS pages using SSL"; m->m_desc = "If this is true, spiders will read " "HTTPS pages using SSL Protocols."; m->m_cgi = "ahttps"; m->m_off = (char *)&cr.m_allowHttps - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; /* m->m_title = "require dollar sign"; m->m_desc = "If this is YES, then do not allow document to be " "indexed if they do not contain a dollar sign ($), but the " "links will still be harvested. Used for building shopping " "index."; m->m_cgi = "nds"; m->m_off = (char *)&cr.m_needDollarSign - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ /* m->m_title = "require numbers in url"; m->m_desc = "If this is YES, then do not allow document to be " "indexed if they do not have two back-to-back digits in the " "path of the url, but the links will still be harvested. Used " "to build a news index."; m->m_cgi = "nniu"; m->m_off = (char *)&cr.m_needNumbersInUrl - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "index news topics"; m->m_desc = "If this is YES, Gigablast will attempt to categorize " "every page as being in particular news categories like " "sports, business, etc. and will be searchable by doing a " "query like \"newstopic:sports."; m->m_cgi = "int"; m->m_off = (char *)&cr.m_getNewsTopic - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ m->m_title = "follow RSS links"; m->m_desc = "If an item on a page has an RSS feed link, add the " "RSS link to the spider queue and index the RSS pages " "instead of the current page."; m->m_cgi = "frss"; m->m_off = (char *)&cr.m_followRSSLinks - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "only index articles from RSS feeds"; m->m_desc = "Only index pages that were linked to by an RSS feed. 
" "Follow RSS Links must be enabled (above)."; m->m_cgi = "orss"; m->m_off = (char *)&cr.m_onlyIndexRSS - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "max text doc length"; m->m_desc = "Gigablast will not download, index or " "store more than this many bytes of an HTML or text " "document. XML is NOT considered to be HTML or text, use " "the rule below to control the maximum length of an XML " "document. " "Use -1 for no max."; m->m_cgi = "mtdl"; m->m_off = (char *)&cr.m_maxTextDocLen - x; m->m_type = TYPE_LONG; m->m_def = "1048576"; // 1MB m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE|PF_API; m++; m->m_title = "max other doc length"; m->m_desc = "Gigablast will not download, index or " "store more than this many bytes of a non-html, non-text " "document. XML documents will be restricted to this " "length. " "Use -1 for no max."; m->m_cgi = "modl"; m->m_off = (char *)&cr.m_maxOtherDocLen - x; m->m_type = TYPE_LONG; m->m_def = "1048576"; // 1MB m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE|PF_API; m++; //m->m_title = "indexdb truncation limit"; //m->m_cgi = "itl"; //m->m_desc = "How many documents per term? Keep this very high."; //m->m_off = (char *)&cr.m_indexdbTruncationLimit - x; //m->m_def = "50000000"; //m->m_type = TYPE_LONG; //m->m_min = MIN_TRUNC; // from Indexdb.h //m++; m->m_title = "apply filter to text pages"; m->m_desc = "If this is false then the filter " "will not be used on html or text pages."; m->m_cgi = "aft"; m->m_off = (char *)&cr.m_applyFilterToText - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "filter name"; m->m_desc = "Program to spawn to filter all HTTP " "replies the spider receives. Leave blank for none."; m->m_cgi = "filter"; m->m_def = ""; m->m_off = (char *)&cr.m_filter - x; m->m_type = TYPE_STRING; m->m_size = MAX_FILTER_LEN+1; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "filter timeout"; m->m_desc = "Kill filter shell after this many seconds. Assume it " "stalled permanently."; m->m_cgi = "fto"; m->m_def = "40"; m->m_off = (char *)&cr.m_filterTimeout - x; m->m_type = TYPE_LONG; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; /* m->m_title = "proxy ip"; m->m_desc = "Retrieve pages from the proxy at this IP address."; m->m_cgi = "proxyip"; m->m_off = (char *)&cr.m_proxyIp - x; m->m_type = TYPE_IP; m->m_def = "0.0.0.0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "proxy port"; m->m_desc = "Retrieve pages from the proxy on " "this port."; m->m_cgi = "proxyport"; m->m_off = (char *)&cr.m_proxyPort - x; m->m_type = TYPE_LONG; m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; */ m->m_title = "make image thumbnails"; m->m_desc = "Try to find the best image on each page and " "store it as a thumbnail for presenting in the search " "results."; m->m_cgi = "mit"; m->m_off = (char *)&cr.m_makeImageThumbnails - x; m->m_type = TYPE_BOOL; // default to off since it slows things down to do this m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "max thumbnail width or height"; m->m_desc = "This is in pixels and limits the size of the thumbnail. 
" "Gigablast tries to make at least the width or the height " "equal to this maximum, but, unless the thumbnail is square, " "one side will be longer than the other."; m->m_cgi = "mtwh"; m->m_off = (char *)&cr.m_thumbnailMaxWidthHeight - x; m->m_type = TYPE_LONG; m->m_def = "250"; m->m_group = 0; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; m->m_title = "index spider status documents"; m->m_desc = "Index a spider status \"document\" " "for every url the spider " "attempts to spider. Search for them using special " "query operators like type:status or gberrorstr:success or " "stats:gberrornum to get a histogram. " "See syntax page for more examples. " "They will not otherwise " "show up in the search results."; // "This will not work for " // "diffbot crawlbot collections yet until it has proven " // "more stable."; m->m_cgi = "isr"; m->m_off = (char *)&cr.m_indexSpiderReplies - x; m->m_type = TYPE_BOOL; // default off for now until we fix it better. 5/26/14 mdw // turn back on 6/21 now that we do not index plain text terms // and we add gbdocspidertime and gbdocindextime terms so you // can use those to sort regular docs and not have spider reply // status docs in the serps. // back on 4/21/2015 seems pretty stable. // but it uses disk space so turn off for now again. 6/16/2015 m->m_def = "0"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE; m++; // i put this in here so i can save disk space for my global // diffbot json index m->m_title = "index body"; m->m_desc = "Index the body of the documents so you can search it. " "Required for searching that. You wil pretty much always " "want to keep this enabled. Does not apply to JSON " "documents."; m->m_cgi = "ib"; m->m_off = (char *)&cr.m_indexBody - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_flags = PF_CLONE ;//| PF_HIDDEN; m++; m->m_cgi = "apiUrl"; m->m_desc = "Send every spidered url to this url and index " "the reply in addition to the normal indexing process. " "Example: by specifying http://api.diffbot.com/v3/" "analyze?mode=high-precision&token= here " "you can index the structured JSON replies from diffbot for " "every url that is spidered. " "Gigablast will automatically " "append a &url= to this url " "before sending it to diffbot."; m->m_xml = "diffbotApiUrl"; m->m_title = "diffbot api url"; m->m_off = (char *)&cr.m_diffbotApiUrl - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_SPIDER; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m->m_def = ""; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_cgi = "urlProcessPatternTwo"; m->m_desc = "Only send urls that match this simple substring " "pattern to Diffbot. Separate substrings with two pipe " "operators, ||. Leave empty for no restrictions."; m->m_xml = "diffbotUrlProcessPattern"; m->m_title = "diffbot url process pattern"; m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_group = 0; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m++; m->m_cgi = "urlProcessRegExTwo"; m->m_desc = "Only send urls that match this regular expression " "to Diffbot. 
" "Leave empty for no restrictions."; m->m_xml = "diffbotUrlProcessRegEx"; m->m_title = "diffbot url process regex"; m->m_off = (char *)&cr.m_diffbotUrlProcessRegEx - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_group = 0; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m++; m->m_cgi = "pageProcessPatternTwo"; m->m_desc = "Only send urls whose content matches this simple " "substring " "pattern to Diffbot. Separate substrings with two pipe " "operators, ||. " "Leave empty for no restrictions."; m->m_xml = "diffbotPageProcessPattern"; m->m_title = "diffbot page process pattern"; m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m->m_def = ""; m->m_group = 0; m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; m++; m->m_title = "spider start time"; m->m_desc = "Only spider URLs scheduled to be spidered " "at this time or after. In UTC."; m->m_cgi = "sta"; m->m_off = (char *)&cr.m_spiderTimeMin - x; m->m_type = TYPE_DATE; // date format -- very special m->m_def = "01 Jan 1970"; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "spider end time"; m->m_desc = "Only spider URLs scheduled to be spidered " "at this time or before. If \"use current time\" is true " "then the current local time is used for this value instead. " "in UTC."; m->m_cgi = "stb"; m->m_off = (char *)&cr.m_spiderTimeMax - x; m->m_type = TYPE_DATE2; m->m_def = "01 Jan 2010"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; m->m_title = "use current time"; m->m_desc = "Use the current time as the spider end time?"; m->m_cgi = "uct"; m->m_off = (char *)&cr.m_useCurrentTime - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_SPIDER; m->m_obj = OBJ_COLL; m++; /* m->m_title = "default ruleset site file num"; m->m_desc = "Use this as the current Sitedb file num for Sitedb " "entries that always use the current default"; m->m_cgi = "dftsfn"; m->m_off = (char *)&cr.m_defaultSiteRec - x; m->m_type = TYPE_LONG; m->m_def = "16"; m++; m->m_title = "RSS ruleset site file num"; m->m_desc = "Use this Sitedb file num ruleset for RSS feeds"; m->m_cgi = "rssrs"; m->m_off = (char *)&cr.m_rssSiteRec - x; m->m_type = TYPE_LONG; m->m_def = "25"; m->m_group = 0; m++; m->m_title = "TOC ruleset site file num"; m->m_desc = "Use this Sitedb file num ruleset " "for Table of Contents pages"; m->m_cgi = "tocrs"; m->m_off = (char *)&cr.m_tocSiteRec - x; m->m_type = TYPE_LONG; m->m_def = "29"; m->m_group = 0; m++; */ /* m->m_title = "store topics vector"; m->m_desc = "Should Gigablast compute and store a topics vector " "for every document indexed. This allows Gigablast to " "do topic clustering without having to compute this vector " "at query time. You can turn topic clustering on in the " "Search Controls page."; m->m_cgi = "utv"; m->m_off = (char *)&cr.m_useGigabitVector - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "use gigabits for vector"; m->m_desc = "For news collection. " "Should Gigablast form the similarity vector using " "Gigabits, as opposed to a straight out random sample. 
" "This does clustering more " "by topic rather than by explicit content in common."; m->m_cgi = "uct"; m->m_off = (char *)&cr.m_useGigabitVector - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; m->m_title = "max similarity to reindex"; m->m_desc = "If the url's content is over X% similar to what we " "already " "have indexed, then do not reindex it, and treat the content " "as if it were unchanged for intelligent spider scheduling " "purposes. Set to 100% to always reindex the document, " "regardless, although the use-ifModifiedSince check " "above may still be in affect, as well as the " "deduping-enabled check. This will also affect the re-spider " "time, because Gigablast spiders documents that change " "frequently faster."; m->m_cgi = "msti"; m->m_off = (char *)&cr.m_maxSimilarityToIndex - x; m->m_type = TYPE_LONG; m->m_def = "100"; m->m_group = 0; m++; */ // this is obsolete -- we can use the reg exp "isroot" /* m->m_title = "root url priority"; m->m_desc = "What spider priority should root urls " "be assigned? Spider priorities range from 0 to 31. If no " "urls are scheduled to be spidered in the priority 31 " "bracket, the spider moves down to 30, etc., until it finds " "a url to spider. If this priority is undefined " "then that url's priority is determined based on the rules " "on the URL filters page. If the priority is still " "undefined then the priority is taken to be the priority of " "the parent minus one, which results in a breadth first " "spidering algorithm."; // html m->m_cgi = "srup"; m->m_off = (char *)&cr.m_spiderdbRootUrlPriority - x; m->m_type = TYPE_PRIORITY2;// 0-(MAX_SPIDER_PRIORITIES-1)dropdown menu m->m_def = "15"; m++; */ /* -- mdw, now in urlfilters using "isaddurl" "reg exp" m->m_title = "add url priority"; m->m_desc = "What is the priority of a url which " "is added to the spider queue via the " "add url page?"; // html m->m_cgi = "saup"; m->m_off = (char *)&cr.m_spiderdbAddUrlPriority - x; m->m_type = TYPE_PRIORITY; // 0-(MAX_SPIDER_PRIORITIES-1)dropdown menu m->m_def = "16"; m->m_group = 0; m++; */ /* m->m_title = "new spider by priority"; m->m_desc = "Specify priorities for which " "new urls not yet in the index should be spidered."; m->m_cgi = "sn"; m->m_xml = "spiderNewBits"; m->m_off = (char *)&cr.m_spiderNewBits - x; m->m_type = TYPE_PRIORITY_BOXES; // array of numbered (0-(MAX_SPIDER_PRIORITIES-1)) checkboxes m->m_fixed = MAX_SPIDER_PRIORITIES; m->m_def = "1"; // default for each one is on m++; m->m_title = "old spider by priority"; m->m_desc = "Specify priorities for which old " "urls already in the index should be spidered."; m->m_cgi = "so"; m->m_xml = "spiderOldBits"; m->m_off = (char *)&cr.m_spiderOldBits - x; m->m_type = TYPE_PRIORITY_BOXES; // array of numbered (0-(MAX_SPIDER_PRIORITIES-1)) checkboxes m->m_fixed = MAX_SPIDER_PRIORITIES; m->m_def = "1"; // default for each one is on m->m_group = 0; m++; m->m_title = "max spiders per domain"; m->m_desc = "How many pages should the spider " "download simultaneously from any one domain? This can " "prevents the spider from hitting one server too hard."; m->m_cgi = "mspd"; m->m_off = (char *)&cr.m_maxSpidersPerDomain - x; m->m_type = TYPE_LONG; m->m_def = "1"; m++; m->m_title = "same domain wait"; m->m_desc = "How many milliseconds should Gigablast wait " "between spidering a second url from the same domain. 
" "This is used to prevent the spiders from hitting a " "website too hard."; m->m_cgi = "sdw"; m->m_off = (char *)&cr.m_sameDomainWait - x; m->m_type = TYPE_LONG; m->m_def = "500"; m->m_group = 0; m++; m->m_title = "same ip wait"; m->m_desc = "How many milliseconds should Gigablast wait " "between spidering a second url from the same IP address. " "This is used to prevent the spiders from hitting a " "website too hard."; m->m_cgi = "siw"; m->m_off = (char *)&cr.m_sameIpWait - x; m->m_type = TYPE_LONG; m->m_def = "10000"; m->m_group = 0; m++; */ /* m->m_title = "use distributed spider lock"; m->m_desc = "Enable distributed spider locking to strictly enforce " "same domain waits at a global level."; m->m_cgi = "udsl"; m->m_off = (char *)&cr.m_useSpiderLocks - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m++; m->m_title = "distribute spider download based on ip"; m->m_desc = "Distribute web downloads based on the ip of the host so " "only one spider ip hits the same hosting ip. Helps " "webmaster's logs look nicer."; m->m_cgi = "udsd"; m->m_off = (char*)&cr.m_distributeSpiderGet - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_group = 0; m++; m->m_title = "percent of water mark to reload queues"; m->m_desc = "When a spider queue drops below this percent of its " "max level it will reload from disk."; m->m_cgi = "rlqp"; m->m_off = (char*)&cr.m_reloadQueuePercent - x; m->m_type = TYPE_LONG; m->m_def = "25"; m++; */ /* m->m_title = "min respider wait"; m->m_desc = "What is the minimum number of days " "the spider should wait before re-visiting a particular " "web page? " "The spiders attempts to determine the update cycle of " "each web page and it tries to visit them as needed, but it " "will not wait less than this number of days regardless."; m->m_cgi = "mrw"; m->m_off = (char *)&cr.m_minRespiderWait - x; m->m_type = TYPE_FLOAT; m->m_def = "1.0"; m++; m->m_title = "max respider wait"; m->m_desc = "What is the maximum number of days " "the spider should wait before re-visiting a particular " "web page?"; m->m_cgi = "xrw"; m->m_off = (char *)&cr.m_maxRespiderWait - x; m->m_type = TYPE_FLOAT; m->m_def = "90.0"; m->m_group = 0; m++; m->m_title = "first respider wait"; m->m_desc = "What is the number of days " "Gigablast should wait before spidering a particular web page " "for the second time? 
Tag in ruleset will override this value " "if it is present."; m->m_cgi = "frw"; m->m_off = (char *)&cr.m_firstRespiderWait - x; m->m_type = TYPE_FLOAT; m->m_def = "30.0"; m->m_group = 0; m++; m->m_title = "error respider wait"; m->m_desc = "If a spidered web page has a network " "error, such as a DNS not found error, or a time out error, " "how many days should Gigablast wait before reattempting " "to spider that web page?"; m->m_cgi = "erw"; m->m_off = (char *)&cr.m_errorRespiderWait - x; m->m_type = TYPE_FLOAT; m->m_def = "2.0"; m->m_group = 0; m++; m->m_title = "doc not found error respider wait"; m->m_desc = "If a spidered web page has a http status " "error, such as a 404 page not found error, " "how many days should Gigablast wait before reattempting " "to spider that web page?"; m->m_cgi = "dnferw"; m->m_off = (char *)&cr.m_docNotFoundErrorRespiderWait - x; m->m_type = TYPE_FLOAT; m->m_def = "7.0"; m->m_group = 0; m++; */ /* m->m_title = "spider max kbps"; m->m_desc = "The maximum kilobits per second " "that the spider can download."; m->m_cgi = "cmkbps"; m->m_off = (char *)&cr.m_maxKbps - x; m->m_type = TYPE_FLOAT; m->m_def = "999999.0"; m++; m->m_title = "spider max pages per second"; m->m_desc = "The maximum number of pages per " "second that can be indexed or deleted from the index."; m->m_cgi = "cmpps"; m->m_off = (char *)&cr.m_maxPagesPerSecond - x; m->m_type = TYPE_FLOAT; m->m_def = "999999.0"; m->m_group = 0; m++; */ /* m->m_title = "spider new percent"; m->m_desc = "Approximate percentage of new vs. old docs to spider. " "If set to a negative number, the old alternating " "priority algorithm is used."; m->m_cgi = "snp"; m->m_off = (char *)&cr.m_spiderNewPct - x; m->m_type = TYPE_FLOAT; m->m_def = "-1.0"; m->m_group = 0; m++; */ /* m->m_title = "number retries per url"; m->m_desc = "How many times should the spider be " "allowed to fail to download a particular web page before " "it gives up? " "Failure may result from temporary loss of internet " "connectivity on the remote end, dns or routing problems."; m->m_cgi = "nr"; m->m_off = (char *)&cr.m_numRetries - x; m->m_type = TYPE_RETRIES; // dropdown from 0 to 3 m->m_def = "1"; m++; m->m_title = "priority of urls being retried"; m->m_desc = "Keep this pretty high so that we get problem urls " "out of the index fast, otherwise, you might be waiting " "months for another retry. Use undefined to indicate " "no change in the priority of the url."; m->m_cgi = "rtp"; m->m_off = (char *)&cr.m_retryPriority - x; m->m_type = TYPE_PRIORITY2; // -1 to 31 m->m_def = "-1"; m->m_group = 0; m++; m->m_title = "max pages in index"; m->m_desc = "What is the maximum number of " "pages that are permitted for this collection?"; m->m_cgi = "mnp"; m->m_off = (char *)&cr.m_maxNumPages - x; m->m_type = TYPE_LONG_LONG; m->m_def = "10000000000"; // 10 billion m++; m->m_title = "import link info"; // from other cluster"; m->m_desc = "Say yes here to make Gigablast import " "link text from another collection into this one " "when spidering urls. Gigablast will " "use the hosts.conf file in the working directory to " "tell it what hosts belong to the cluster to import from. 
" "Gigablast " "will use the \"update link votes frequency\" parm above " "to determine if the info should be recomputed on the other " "cluster."; m->m_cgi = "eli"; // external link info m->m_off = (char *)&cr.m_getExternalLinkInfo - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 2; m++; m->m_title = "use hosts2.conf for import cluster"; m->m_desc = "Tell Gigablast to import from the cluster defined by " "hosts2.conf in the working directory, rather than " "hosts.conf"; m->m_cgi = "elib"; // external link info m->m_off = (char *)&cr.m_importFromHosts2Conf - x; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_priv = 2; m->m_group = 0; m++; //m->m_title = "get link info from other cluster in real-time"; //m->m_desc = "Say yes here to make Gigablast tell the other " // "cluster to compute the link info, not just return a " // "stale copy from the last time it computed it."; //m->m_cgi = "elif"; // external link info fresh //m->m_off = (char *)&cr.m_getExternalLinkInfoFresh - x; //m->m_type = TYPE_BOOL; //m->m_def = "0"; //m->m_group = 0; //m->m_priv = 2; //m++; m->m_title = "collection to import from"; m->m_desc = "Gigablast will fetch the link info from this " "collection."; m->m_cgi = "elic"; // external link info m->m_off = (char *)&cr.m_externalColl - x; m->m_type = TYPE_STRING; m->m_size = MAX_COLL_LEN+1; m->m_def = ""; m->m_group = 0; m->m_priv = 2; m++; m->m_title = "turk tags to display"; m->m_desc = "Tell pageturk to display the tag questions " "for the comma separated tag names." " no space allowed."; m->m_cgi = "ttags"; m->m_xml = "turkTags"; m->m_type = TYPE_STRING; m->m_size = 256; m->m_def = "blog,spam,news"; m->m_off = (char *)&cr.m_turkTags - x; m->m_group = 0; m->m_priv = 2; m++; */ /* // now we store this in title recs, so we can change it on the fly m->m_title = "title weight"; m->m_desc = "Weight title this much more or less. This units are " "percentage. A 100 means to not give the title any special " "weight. Generally, though, you want to give it significantly " "more weight than that, so 2400 is the default."; m->m_cgi = "tw"; m->m_off = (char *)&cr.m_titleWeight - x; m->m_type = TYPE_LONG; m->m_def = "4600"; m->m_min = 0; m++; // now we store this in title recs, so we can change it on the fly m->m_title = "header weight"; m->m_desc = "Weight terms in header tags by this much more or less. " "This units are " "percentage. A 100 means to not give the header any special " "weight. Generally, though, you want to give it significantly " "more weight than that, so 600 is the default."; m->m_cgi = "hw"; m->m_off = (char *)&cr.m_headerWeight - x; m->m_type = TYPE_LONG; m->m_def = "600"; m->m_min = 0; m->m_group = 0; m++; // now we store this in title recs, so we can change it on the fly m->m_title = "url path word weight"; m->m_desc = "Weight text in url path this much more. " "The units are " "percentage. A 100 means to not give any special " "weight. Generally, though, you want to give it significantly " "more weight than that, so 600 is the default."; m->m_cgi = "upw"; m->m_off = (char *)&cr.m_urlPathWeight - x; m->m_type = TYPE_LONG; m->m_def = "1600"; m->m_min = 0; m->m_group = 0; m++; // now we store this in title recs, so we can change it on the fly m->m_title = "external link text weight"; m->m_desc = "Weight text in the incoming external link text this " "much more. The units are percentage. 
It already receives a " "decent amount of weight naturally."; m->m_cgi = "eltw"; m->m_off = (char *)&cr.m_externalLinkTextWeight - x; m->m_type = TYPE_LONG; m->m_def = "600"; m->m_min = 0; m->m_group = 0; m++; // now we store this in title recs, so we can change it on the fly m->m_title = "internal link text weight"; m->m_desc = "Weight text in the incoming internal link text this " "much more. The units are percentage. It already receives a " "decent amount of weight naturally."; m->m_cgi = "iltw"; m->m_off = (char *)&cr.m_internalLinkTextWeight - x; m->m_type = TYPE_LONG; m->m_def = "200"; m->m_min = 0; m->m_group = 0; m++; // now we store this in title recs, so we can change it on the fly m->m_title = "concept weight"; m->m_desc = "Weight concepts this much more. " "The units are " "percentage. It already receives a decent amount of weight " "naturally. AKA: surrounding text boost."; m->m_cgi = "cw"; m->m_off = (char *)&cr.m_conceptWeight - x; m->m_type = TYPE_LONG; m->m_def = "50"; m->m_min = 0; m->m_group = 0; m++; */ /* // now we store this in title recs, so we can change it on the fly m->m_title = "site num inlinks boost base"; m->m_desc = "Boost the score of all terms in the document using " "this number. " "The boost itself is expressed as a percentage. " "The boost is B^X, where X is the number of good " "inlinks to the document's site " "and B is this is this boost base. " "The score of each term in the " "document is multiplied by the boost. That product " "becomes the new score of that term. " "For purposes of this calculation we limit X to 1000."; m->m_cgi = "qbe"; m->m_off = (char *)&cr.m_siteNumInlinksBoostBase - x; m->m_type = TYPE_FLOAT; m->m_def = "1.005"; m->m_min = 0; m->m_group = 0; m++; */ /* // use menu elimination technology? m->m_title = "only index article content"; m->m_desc = "If this is true gigablast will only index the " "article content on pages identifed as permalinks. It will " "NOT index any page content on non-permalink pages, and it " "will avoid indexing menu content on any page. It will not " "index meta tags on any page. It will only index incoming " "link text for permalink pages. Useful when " "indexing blog or news sites."; m->m_cgi = "met"; m->m_off = (char *)&cr.m_eliminateMenus - x; m->m_type = TYPE_BOOL; m->m_def = "0"; m++; */ // replace by lang== lang!= in url filters //m->m_title = "collection language"; //m->m_desc = "Only spider pages determined to be in " // "this language (see Language.h)"; //m->m_cgi = "clang"; //m->m_off = (char *)&cr.m_language - x; //m->m_type = TYPE_LONG; //m->m_def = "0"; //m++; //////////////// // END PAGE SPIDER CONTROLS //////////////// /////////////////////////////////////////// // PAGE REPAIR CONTROLS /////////////////////////////////////////// m->m_title = "rebuild mode enabled"; m->m_desc = "If enabled, gigablast will rebuild the rdbs as " "specified by the parameters below. 
When a particular "
	"collection is in rebuild mode, it cannot spider or merge "
	"titledb files.";
	m->m_cgi = "rme";
	m->m_off = (char *)&g_conf.m_repairingEnabled - g;
	m->m_type = TYPE_BOOL;
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_def = "0";
	m->m_sync = false; // do not sync this parm
	m++;

	m->m_title = "collection to rebuild";
	m->m_xml = "collectionToRebuild";
	m->m_desc = "Name of collection to rebuild.";
	// m->m_desc = "Comma or space separated list of the collections "
	//	"to rebuild.";
	m->m_cgi = "rctr"; // repair collections to repair
	m->m_off = (char *)&g_conf.m_collsToRepair - g;
	m->m_type = TYPE_SAFEBUF;//STRING;
	//m->m_size = 1024;
	m->m_def = "";
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_group = 0;
	m->m_flags = PF_REQUIRED;// | PF_COLLDEFAULT;//| PF_NOHTML;
	m++;

	m->m_title = "rebuild ALL collections";
	m->m_desc = "If enabled, gigablast will rebuild all collections.";
	m->m_cgi = "rac";
	m->m_off = (char *)&g_conf.m_rebuildAllCollections - g;
	m->m_type = TYPE_BOOL;
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_def = "0";
	m->m_group = 0;
	m++;

	m->m_title = "memory to use for rebuild";
	m->m_desc = "In bytes.";
	m->m_cgi = "rmtu"; // repair mem to use
	m->m_off = (char *)&g_conf.m_repairMem - g;
	m->m_type = TYPE_LONG;
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_def = "200000000";
	m->m_units = "bytes";
	m->m_group = 0;
	m++;

	m->m_title = "max rebuild injections";
	m->m_desc = "Maximum number of outstanding injections for "
		"rebuild.";
	m->m_cgi = "mrps";
	m->m_off = (char *)&g_conf.m_maxRepairSpiders - g;
	m->m_type = TYPE_LONG;
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_def = "2";
	m->m_group = 0;
	m++;

	m->m_title = "full rebuild";
	m->m_desc = "If enabled, gigablast will reinject the content of "
		"all title recs into a secondary rdb system. That will "
		"replace the primary rdb system when complete.";
	m->m_cgi = "rfr"; // repair full rebuild
	m->m_off = (char *)&g_conf.m_fullRebuild - g;
	m->m_type = TYPE_BOOL;
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_def = "1";
	m->m_group = 0;
	m++;

	m->m_title = "add spiderdb recs of non indexed urls";
	m->m_desc = "If enabled, gigablast will add the spiderdb records "
		"of unindexed urls when doing the full rebuild or the "
		"spiderdb rebuild. Otherwise, only the indexed urls will "
		"get records in spiderdb. This can be faster because "
		"Gigablast does not have to do an IP lookup on every url "
		"if its IP address is not in tagdb already.";
	m->m_cgi = "rfrknsx";
	m->m_off = (char *)&g_conf.m_rebuildAddOutlinks - g;
	m->m_type = TYPE_BOOL;
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_def = "0";
	m->m_group = 0;
	m++;

	m->m_title = "recycle link text";
	m->m_desc = "If enabled, gigablast will recycle the link text "
		"when rebuilding titledb. The siterank, which is "
		"determined by the number of inlinks to a site, is "
		"stored/cached in tagdb so that is a separate item. 
If you want to pick up new " "link text you will want to set this to NO and " "make sure to rebuild titledb, since that stores the " "link text."; m->m_cgi = "rrli"; // repair full rebuild m->m_off = (char *)&g_conf.m_rebuildRecycleLinkInfo - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "1"; m->m_group = 0; m++; /* m->m_title = "recycle imported link info"; m->m_desc = "If enabled, gigablast will recycle the imported " "link info when rebuilding titledb."; m->m_cgi = "rrlit"; // repair full rebuild m->m_off = (char *)&g_conf.m_rebuildRecycleLinkInfo2 - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "1"; m->m_group = 0; m++; */ /* m->m_title = "remove bad pages"; m->m_desc = "If enabled, gigablast just scans the titledb recs " "in the given collection and removes those that are " "banned or filtered according to the url filters table. It " "will also lookup in tagdb."; m->m_cgi = "rbadp"; m->m_off = (char *)&g_conf.m_removeBadPages - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m++; */ m->m_title = "rebuild titledb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrt"; // repair rebuild titledb m->m_off = (char *)&g_conf.m_rebuildTitledb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "0"; m++; /* m->m_title = "rebuild tfndb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rru"; // repair rebuild tfndb m->m_off = (char *)&g_conf.m_rebuildTfndb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild indexdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rri"; m->m_off = (char *)&g_conf.m_rebuildIndexdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; */ m->m_title = "rebuild posdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rri"; m->m_off = (char *)&g_conf.m_rebuildPosdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "0"; m->m_group = 0; m++; /* m->m_title = "rebuild no splits"; m->m_desc = "If enabled, gigablast will just re-add the no split " "lists from all the current title recs back into indexdb."; m->m_cgi = "rns"; m->m_off = (char *)&g_conf.m_rebuildNoSplits - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild datedb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrd"; m->m_off = (char *)&g_conf.m_rebuildDatedb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild checksumdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrch"; m->m_off = (char *)&g_conf.m_rebuildChecksumdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; */ m->m_title = "rebuild clusterdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrcl"; m->m_off = (char *)&g_conf.m_rebuildClusterdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild spiderdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrsp"; m->m_off = (char *)&g_conf.m_rebuildSpiderdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "0"; m->m_group = 0; m++; /* m->m_title = "rebuild tagdb"; m->m_desc = "If enabled, gigablast will rebuild this 
rdb"; m->m_cgi = "rrsi"; m->m_off = (char *)&g_conf.m_rebuildSitedb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; */ m->m_title = "rebuild linkdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrld"; m->m_off = (char *)&g_conf.m_rebuildLinkdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "0"; m->m_group = 0; m++; /* m->m_title = "rebuild tagdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrtgld"; m->m_off = (char *)&g_conf.m_rebuildTagdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild placedb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrpld"; m->m_off = (char *)&g_conf.m_rebuildPlacedb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild timedb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrtmd"; m->m_off = (char *)&g_conf.m_rebuildTimedb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild sectiondb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrsnd"; m->m_off = (char *)&g_conf.m_rebuildSectiondb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; m->m_title = "rebuild revdb"; m->m_desc = "If enabled, gigablast will rebuild this rdb"; m->m_cgi = "rrrvd"; m->m_off = (char *)&g_conf.m_rebuildRevdb - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_def = "0"; m->m_group = 0; m++; */ m->m_title = "rebuild root urls"; m->m_desc = "If disabled, gigablast will skip root urls."; m->m_cgi = "ruru"; m->m_off = (char *)&g_conf.m_rebuildRoots - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "1"; m++; m->m_title = "rebuild non-root urls"; m->m_desc = "If disabled, gigablast will skip non-root urls."; m->m_cgi = "runru"; m->m_off = (char *)&g_conf.m_rebuildNonRoots - g; m->m_type = TYPE_BOOL; m->m_page = PAGE_REPAIR; m->m_obj = OBJ_CONF; m->m_def = "1"; m->m_group = 0; m++; /* m->m_title = "skip tagdb lookup"; m->m_desc = "When rebuilding spiderdb and scanning it for new " "spiderdb records, should a tagdb lookup be performed? " "Runs much much " "faster without it. 
Will also keep the original doc quality "
	"and spider priority intact.";
	m->m_cgi = "rssl";
	m->m_off = (char *)&g_conf.m_rebuildSkipSitedbLookup - g;
	m->m_type = TYPE_BOOL;
	m->m_page = PAGE_REPAIR;
	m->m_obj = OBJ_CONF;
	m->m_def = "0";
	m->m_group = 0;
	m++;
	*/

	///////////////////////////////////////////
	// END PAGE REPAIR
	///////////////////////////////////////////

	///////////////////////////////////////////
	// AUTOBAN CONTROLS
	///////////////////////////////////////////

	m->m_title = "ban IPs";
	m->m_desc = "Add IPs here to bar them from accessing this "
		"gigablast server.";
	m->m_cgi = "banIps";
	m->m_xml = "banIps";
	m->m_off = (char *)g_conf.m_banIps - g;
	m->m_type = TYPE_STRINGBOX;
	m->m_page = PAGE_AUTOBAN;
	m->m_obj = OBJ_CONF;
	m->m_size = AUTOBAN_TEXT_SIZE;
	m->m_group = 1;
	m->m_def = "";
	m->m_plen = (char *)&g_conf.m_banIpsLen - g; // length of string
	m++;

	m->m_title = "allow IPs";
	m->m_desc = "Add IPs here to give them an infinite query quota.";
	m->m_cgi = "allowIps";
	m->m_xml = "allowIps";
	m->m_off = (char *)g_conf.m_allowIps - g;
	m->m_type = TYPE_STRINGBOX;
	m->m_page = PAGE_AUTOBAN;
	m->m_size = AUTOBAN_TEXT_SIZE;
	m->m_group = 1;
	m->m_def = "";
	m->m_plen = (char *)&g_conf.m_allowIpsLen - g; // length of string
	m->m_obj = OBJ_CONF;
	m++;

	m->m_title = "valid search codes";
	m->m_desc = "Don't try to autoban queries that have one of these "
		"codes. Also, the code must be valid for us to use "
		"&uip=IPADDRESS as the IP address of the submitter for "
		"purposes of autoban AND purposes of addurl daily "
		"quotas.";
	m->m_cgi = "validCodes";
	m->m_xml = "validCodes";
	m->m_off = (char *)g_conf.m_validCodes - g;
	m->m_type = TYPE_STRINGBOX;
	m->m_page = PAGE_AUTOBAN;
	m->m_size = AUTOBAN_TEXT_SIZE;
	m->m_group = 1;
	m->m_def = "";
	m->m_plen = (char *)&g_conf.m_validCodesLen - g; // length of string
	m->m_obj = OBJ_CONF;
	m++;

	m->m_title = "Extra Parms";
	m->m_desc = "Append extra default parms to queries that match "
		"certain substrings. Format: text to match in url, "
		"followed by a space, then the list of extra parms as "
		"they would appear appended to the url. One match per "
		"line.";
	m->m_cgi = "extraParms";
	m->m_xml = "extraParms";
	m->m_off = (char *)g_conf.m_extraParms - g;
	m->m_type = TYPE_STRINGBOX;
	m->m_page = PAGE_AUTOBAN;
	m->m_size = AUTOBAN_TEXT_SIZE;
	m->m_group = 1;
	m->m_def = "";
	m->m_plen = (char *)&g_conf.m_extraParmsLen - g; // length of string
	m->m_obj = OBJ_CONF;
	m++;

	m->m_title = "ban substrings";
	m->m_desc = "Ban any query that matches this list of substrings. "
		"Must match all comma-separated strings on the same "
		"line. ('\\n' = OR, ',' = AND)";
	m->m_cgi = "banRegex";
	m->m_xml = "banRegex";
	m->m_off = (char *)g_conf.m_banRegex - g;
	m->m_type = TYPE_STRINGBOX;
	m->m_page = PAGE_AUTOBAN;
	m->m_size = AUTOBAN_TEXT_SIZE;
	m->m_group = 1;
	m->m_def = "";
	m->m_plen = (char *)&g_conf.m_banRegexLen - g; // length of string
	m->m_obj = OBJ_CONF;
	m++;
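	// The '\n' = OR, ',' = AND semantics above mean: ban the query if,
	// for ANY line, ALL of that line's comma-separated substrings
	// occur in it. A hedged std::string sketch (the real code works
	// on raw buffers, not streams):
	//
	//   #include <sstream>
	//   #include <string>
	//   bool isBannedQuery ( const std::string &q ,
	//                        const std::string &rules ) {
	//           std::istringstream lines ( rules );
	//           std::string line;
	//           while ( std::getline ( lines , line ) ) {    // OR
	//                   if ( line.empty() ) continue;
	//                   std::istringstream parts ( line );
	//                   std::string sub;
	//                   bool all = true;
	//                   while ( std::getline ( parts , sub , ',' ) ) // AND
	//                           if ( q.find ( sub ) == std::string::npos ) {
	//                                   all = false;
	//                                   break;
	//                           }
	//                   if ( all ) return true;
	//           }
	//           return false;
	//   }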
" "Any matching password will have administrative access " "to Gigablast and all collections."; //"If no Admin Password or Admin IP is specified then " //"Gigablast will only allow local IPs to connect to it " //"as the master admin."; m->m_cgi = "masterpwds"; m->m_xml = "masterPasswords"; m->m_def = ""; m->m_obj = OBJ_CONF; m->m_off = (char *)&g_conf.m_masterPwds - g; m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY; m->m_page = PAGE_MASTERPASSWORDS; //m->m_max = MAX_MASTER_PASSWORDS; //m->m_size = PASSWORD_MAX_LEN+1; //m->m_addin = 1; // "insert" follows? m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA; m++; m->m_title = "Master IPs"; //m->m_desc = "Allow UDP requests from this list of IPs. Any datagram " // "received not coming from one of these IPs, or an IP in " // "hosts.conf, is dropped. If another cluster is accessing this " // "cluster for getting link text or whatever, you will need to " // "list the IPs of the accessing machines here. These IPs are " // "also used to allow access to the HTTP server even if it " // "was disabled in the Master Controls. IPs that have 0 has " // "their Least Significant Byte are treated as wildcards for " // "IP blocks. That is, 1.2.3.0 means 1.2.3.*."; m->m_desc = "Whitespace separated list of Ips. " "Any IPs in this list will have administrative access " "to Gigablast and all collections."; m->m_cgi = "masterips"; m->m_xml = "masterIps"; m->m_page = PAGE_MASTERPASSWORDS; m->m_off = (char *)&g_conf.m_connectIps - g; m->m_type = TYPE_SAFEBUF;//IP; m->m_def = ""; //m->m_max = MAX_CONNECT_IPS; //m->m_priv = 2; //m->m_addin = 1; // "insert" follows? //m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_obj = OBJ_CONF; m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA; m++; // m->m_title = "remove connect ip"; // m->m_desc = "remove a connect ip"; // m->m_cgi = "removeip"; // m->m_type = TYPE_CMD; // m->m_page = PAGE_NONE; // m->m_func = CommandRemoveConnectIpRow; // m->m_cast = 1; // m->m_obj = OBJ_CONF; // m++; // m->m_title = "remove a password"; // m->m_desc = "remove a password"; // m->m_cgi = "removepwd"; // m->m_type = TYPE_CMD; // m->m_page = PAGE_NONE; // m->m_func = CommandRemovePasswordRow; // m->m_cast = 1; // m->m_obj = OBJ_CONF; // m++; /* m->m_title = "Super Turks"; m->m_desc = "Add facebook user IDs here so those people can " "turk the results. Later we may limit each person to " "turking a geographic region."; m->m_cgi = "supterturks"; m->m_xml = "supterturks"; m->m_def = ""; m->m_off = (char *)&g_conf.m_superTurks - g; m->m_type = TYPE_STRINGBOX; m->m_perms = PAGE_MASTER; m->m_size = USERS_TEXT_SIZE; m->m_plen = (char *)&g_conf.m_superTurksLen - g; m->m_page = PAGE_MASTERPASSWORDS; m->m_flags = PF_HIDDEN | PF_NOSAVE; m++; */ /* m->m_title = "Users"; m->m_desc = "Add users here. The format is " "collection:ip:username:password:relogin:pages:tagnames" " Username and password cannot be blank." " You can specify " "* for collection to indicate all collections. " " * can be used in IP as wildcard. " " * in pages means user has access to all pages. Also" " you can specify individual pages. A \'-\' sign at the" " start of page means user is not allowed to access that" " page. Please refer the page reference table at the bottom " "of this page for available pages. If you want to just login " " once and avoid relogin for gb shutdowns then set relogin=1," " else set it to 0. If relogin is 1 your login will never expire either." "
" " Ex: 1. master user -> *:*:master:master:1:*:english
" " 2. public user -> *:*:public:1234:0:index.html" ",get,search,login,dir:english
" "3. turk user -> 66.28.58.122:main:turk:1234:0:pageturkhome," "pageturk,pageturkget,get,login:english"; m->m_cgi = "users"; m->m_xml = "users"; m->m_off = (char *)&g_conf.m_users - g; m->m_type = TYPE_STRINGBOX; m->m_perms = PAGE_MASTER; m->m_size = USERS_TEXT_SIZE; m->m_plen = (char *)&g_conf.m_usersLen - g; m->m_page = PAGE_MASTERPASSWORDS; m++; */ /* m->m_title = "Master IPs"; m->m_desc = "If someone connects from one of these IPs " "then they will have full " "master administrator privileges. " "If no IPs are specified, then master administrators can " "get access for any IP. " "Connecting from 127.0.0.1 always grants master privledges. " "If no Master Password or Master IP is specified then " "Gigablast will assign a default password of footbar23."; m->m_cgi = "masterip"; m->m_xml = "masterIp"; m->m_max = MAX_MASTER_IPS; m->m_off = (char *)g_conf.m_masterIps - g; m->m_type = TYPE_IP; m++; */ m->m_title = "Collection Passwords"; m->m_desc = "Whitespace separated list of passwords. " "Any matching password will have administrative access " "to the controls for just this collection. The master " "password and IPs are controlled through the " "master passwords link under the ADVANCED controls " "tab. The master passwords or IPs have administrative " "access to all collections."; m->m_cgi = "collpwd"; m->m_xml = "collectionPasswords"; m->m_obj = OBJ_COLL; m->m_off = (char *)&cr.m_collectionPasswords - x; m->m_def = ""; m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY; m->m_page = PAGE_COLLPASSWORDS; m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA; m++; m->m_title = "Collection IPs"; m->m_desc = "Whitespace separated list of IPs. " "Any matching IP will have administrative access " "to the controls for just this collection."; m->m_cgi = "collips"; m->m_xml = "collectionIps"; m->m_obj = OBJ_COLL; m->m_off = (char *)&cr.m_collectionIps - x; m->m_def = ""; m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY; m->m_page = PAGE_COLLPASSWORDS; m->m_flags = PF_PRIVATE | PF_TEXTAREA | PF_SMALLTEXTAREA; m++; ////// // END SECURITY CONTROLS ////// /////////////////////////////////////////// // LOG CONTROLS /////////////////////////////////////////// m->m_title = "log http requests"; m->m_desc = "Log GET and POST requests received from the " "http server?"; m->m_cgi = "hr"; m->m_off = (char *)&g_conf.m_logHttpRequests - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log autobanned queries"; m->m_desc = "Should we log queries that are autobanned? 
" "They can really fill up the log."; m->m_cgi = "laq"; m->m_off = (char *)&g_conf.m_logAutobannedQueries - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log query time threshold"; m->m_desc = "If query took this many millliseconds or longer, then log the " "query and the time it took to process."; m->m_cgi = "lqtt"; m->m_off = (char *)&g_conf.m_logQueryTimeThreshold- g; m->m_type = TYPE_LONG; m->m_def = "5000"; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log query reply"; m->m_desc = "Log query reply in proxy, but only for those queries " "above the time threshold above."; m->m_cgi = "lqr"; m->m_off = (char *)&g_conf.m_logQueryReply - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_group = 0; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log spidered urls"; m->m_desc = "Log status of spidered or injected urls?"; m->m_cgi = "lsu"; m->m_off = (char *)&g_conf.m_logSpideredUrls - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log network congestion"; m->m_desc = "Log messages if Gigablast runs out of udp sockets?"; m->m_cgi = "lnc"; m->m_off = (char *)&g_conf.m_logNetCongestion - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log informational messages"; m->m_desc = "Log messages not related to an error condition, " "but meant more to give an idea of the state of " "the gigablast process. These can be useful when " "diagnosing problems."; m->m_cgi = "li"; m->m_off = (char *)&g_conf.m_logInfo - g; m->m_type = TYPE_BOOL; m->m_def = "1"; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log limit breeches"; m->m_desc = "Log it when document not added due to quota " "breech. 
"Log it when url is too long and it gets " "truncated."; m->m_cgi = "ll"; m->m_off = (char *)&g_conf.m_logLimits - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug admin messages"; m->m_desc = "Log various debug messages."; m->m_cgi = "lda"; m->m_off = (char *)&g_conf.m_logDebugAdmin - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug build messages"; m->m_cgi = "ldb"; m->m_off = (char *)&g_conf.m_logDebugBuild - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug build time messages"; m->m_cgi = "ldbt"; m->m_off = (char *)&g_conf.m_logDebugBuildTime - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug database messages"; m->m_cgi = "ldd"; m->m_off = (char *)&g_conf.m_logDebugDb - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug dirty messages"; m->m_cgi = "lddm"; m->m_off = (char *)&g_conf.m_logDebugDirty - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug disk messages"; m->m_cgi = "lddi"; m->m_off = (char *)&g_conf.m_logDebugDisk - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug disk page cache"; m->m_cgi = "ldpc"; m->m_off = (char *)&g_conf.m_logDebugDiskPageCache - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug dns messages"; m->m_cgi = "lddns"; m->m_off = (char *)&g_conf.m_logDebugDns - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug http messages"; m->m_cgi = "ldh"; m->m_off = (char *)&g_conf.m_logDebugHttp - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug image messages"; m->m_cgi = "ldi"; m->m_off = (char *)&g_conf.m_logDebugImage - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug loop messages"; m->m_cgi = "ldl"; m->m_off = (char *)&g_conf.m_logDebugLoop - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug language detection messages"; m->m_cgi = "ldg"; m->m_off = (char *)&g_conf.m_logDebugLang - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug link info"; m->m_cgi = "ldli"; m->m_off = (char *)&g_conf.m_logDebugLinkInfo - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug mem messages"; m->m_cgi = "ldm"; m->m_off = (char *)&g_conf.m_logDebugMem - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug mem usage messages"; m->m_cgi = "ldmu"; m->m_off = (char *)&g_conf.m_logDebugMemUsage - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++;
m->m_title = "log debug net messages"; m->m_cgi = "ldn"; m->m_off = (char 
*)&g_conf.m_logDebugNet - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug post query rerank messages"; m->m_cgi = "ldpqr"; m->m_off = (char *)&g_conf.m_logDebugPQR - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug query messages"; m->m_cgi = "ldq"; m->m_off = (char *)&g_conf.m_logDebugQuery - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug quota messages"; m->m_cgi = "ldqta"; m->m_off = (char *)&g_conf.m_logDebugQuota - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug robots messages"; m->m_cgi = "ldr"; m->m_off = (char *)&g_conf.m_logDebugRobots - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug spider cache messages"; m->m_cgi = "lds"; m->m_off = (char *)&g_conf.m_logDebugSpcache - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; /* m->m_title = "log debug spider wait messages"; m->m_cgi = "ldspw"; m->m_off = (char *)&g_conf.m_logDebugSpiderWait - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m++; */ m->m_title = "log debug speller messages"; m->m_cgi = "ldsp"; m->m_off = (char *)&g_conf.m_logDebugSpeller - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug sections messages"; m->m_cgi = "ldscc"; m->m_off = (char *)&g_conf.m_logDebugSections - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug seo insert messages"; m->m_cgi = "ldsi"; m->m_off = (char *)&g_conf.m_logDebugSEOInserts - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug seo messages"; m->m_cgi = "ldseo"; m->m_off = (char *)&g_conf.m_logDebugSEO - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug stats messages"; m->m_cgi = "ldst"; m->m_off = (char *)&g_conf.m_logDebugStats - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug summary messages"; m->m_cgi = "ldsu"; m->m_off = (char *)&g_conf.m_logDebugSummary - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug spider messages"; m->m_cgi = "ldspid"; m->m_off = (char *)&g_conf.m_logDebugSpider - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug msg13 messages"; m->m_cgi = "ldspmth"; m->m_off = (char *)&g_conf.m_logDebugMsg13 - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "disable host0 for msg13 reception hack"; m->m_cgi = "dmth"; m->m_off = (char *)&g_conf.m_diffbotMsg13Hack - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug spider proxies"; m->m_cgi = "ldspr"; m->m_off = (char *)&g_conf.m_logDebugProxies - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj 
= OBJ_CONF; m++; m->m_title = "log debug url attempts"; m->m_cgi = "ldspua"; m->m_off = (char *)&g_conf.m_logDebugUrlAttempts - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug spider downloads"; m->m_cgi = "ldsd"; m->m_off = (char *)&g_conf.m_logDebugDownloads - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug facebook"; m->m_cgi = "ldfb"; m->m_off = (char *)&g_conf.m_logDebugFacebook - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug tagdb messages"; m->m_cgi = "ldtm"; m->m_off = (char *)&g_conf.m_logDebugTagdb - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug tcp messages"; m->m_cgi = "ldt"; m->m_off = (char *)&g_conf.m_logDebugTcp - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug tcp buffer messages"; m->m_cgi = "ldtb"; m->m_off = (char *)&g_conf.m_logDebugTcpBuf - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug thread messages"; m->m_cgi = "ldth"; m->m_off = (char *)&g_conf.m_logDebugThread - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug title messages"; m->m_cgi = "ldti"; m->m_off = (char *)&g_conf.m_logDebugTitle - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug timedb messages"; m->m_cgi = "ldtim"; m->m_off = (char *)&g_conf.m_logDebugTimedb - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug topic messages"; m->m_cgi = "ldto"; m->m_off = (char *)&g_conf.m_logDebugTopics - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug topDoc messages"; m->m_cgi = "ldtopd"; m->m_off = (char *)&g_conf.m_logDebugTopDocs - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug udp messages"; m->m_cgi = "ldu"; m->m_off = (char *)&g_conf.m_logDebugUdp - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug unicode messages"; m->m_cgi = "ldun"; m->m_off = (char *)&g_conf.m_logDebugUnicode - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug repair messages"; m->m_cgi = "ldre"; m->m_off = (char *)&g_conf.m_logDebugRepair - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log debug pub date extraction messages"; m->m_cgi = "ldpd"; m->m_off = (char *)&g_conf.m_logDebugDate - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log timing messages for build"; m->m_desc = "Log various timing related messages."; m->m_cgi = "ltb"; m->m_off = (char *)&g_conf.m_logTimingBuild - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log timing messages for admin"; m->m_desc = "Log various timing related 
messages."; m->m_cgi = "ltadm"; m->m_off = (char *)&g_conf.m_logTimingAdmin - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log timing messages for database"; m->m_cgi = "ltd"; m->m_off = (char *)&g_conf.m_logTimingDb - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log timing messages for network layer"; m->m_cgi = "ltn"; m->m_off = (char *)&g_conf.m_logTimingNet - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log timing messages for query"; m->m_cgi = "ltq"; m->m_off = (char *)&g_conf.m_logTimingQuery - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log timing messages for spcache"; m->m_desc = "Log various timing related messages."; m->m_cgi = "ltspc"; m->m_off = (char *)&g_conf.m_logTimingSpcache - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log timing messages for related topics"; m->m_cgi = "ltt"; m->m_off = (char *)&g_conf.m_logTimingTopics - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; m->m_title = "log reminder messages"; m->m_desc = "Log reminders to the programmer. You do not need this."; m->m_cgi = "lr"; m->m_off = (char *)&g_conf.m_logReminders - g; m->m_type = TYPE_BOOL; m->m_def = "0"; m->m_priv = 1; m->m_page = PAGE_LOG; m->m_obj = OBJ_CONF; m++; ///// // END PAGE LOG CONTROLS ///// // END PARMS PARM END PARMS END m_numParms = m - m_parms; // sanity check if ( m_numParms >= MAX_PARMS ) { log("admin: Boost MAX_PARMS."); exit(-1); } // make xml tag names and store in here static char s_tbuf [ 18000 ]; char *p = s_tbuf; char *pend = s_tbuf + 18000; int32_t size; char t; // . set hashes of title // . used by Statsdb.cpp for identifying a parm for ( int32_t i = 0 ; i < m_numParms ; i++ ) { if ( ! m_parms[i].m_title ) continue; m_parms[i].m_hash = hash32n ( m_parms[i].m_title ); } // cgi hashes for ( int32_t i = 0 ; i < m_numParms ; i++ ) { if ( ! m_parms[i].m_cgi ) continue; m_parms[i].m_cgiHash = hash32n ( m_parms[i].m_cgi ); } // sanity check: ensure all cgi parms are different for ( int32_t i = 0 ; i < m_numParms ; i++ ) { for ( int32_t j = 0 ; j < m_numParms ; j++ ) { if ( j == i ) continue; if ( m_parms[i].m_type == TYPE_BOOL2 ) continue; if ( m_parms[j].m_type == TYPE_BOOL2 ) continue; if ( m_parms[i].m_type == TYPE_CMD ) continue; if ( m_parms[j].m_type == TYPE_CMD ) continue; if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue; if ( m_parms[j].m_type == TYPE_FILEUPLOADBUTTON ) continue; if ( m_parms[i].m_obj == OBJ_NONE ) continue; if ( m_parms[j].m_obj == OBJ_NONE ) continue; if ( m_parms[i].m_flags & PF_DUP ) continue; if ( m_parms[j].m_flags & PF_DUP ) continue; // hack to allow "c" for search, inject, addurls if ( m_parms[j].m_page != m_parms[i].m_page && m_parms[i].m_obj != OBJ_COLL && m_parms[i].m_obj != OBJ_CONF ) continue; if ( ! m_parms[i].m_cgi ) continue; if ( ! m_parms[j].m_cgi ) continue; // gotta be on same page now i guess int32_t obj1 = m_parms[i].m_obj; int32_t obj2 = m_parms[j].m_obj; if ( obj1 != OBJ_COLL && obj1 != OBJ_CONF ) continue; if ( obj2 != OBJ_COLL && obj2 != OBJ_CONF ) continue; //if ( m_parms[i].m_page != m_parms[j].m_page ) continue; // a different m_scmd means a different cgi parm really... 
//if ( m_parms[i].m_sparm && m_parms[j].m_sparm && // strcmp ( m_parms[i].m_scmd, m_parms[j].m_scmd) != 0 ) // continue; if ( strcmp ( m_parms[i].m_cgi , m_parms[j].m_cgi ) != 0 && // ensure cgi hashes are different as well! m_parms[i].m_cgiHash != m_parms[j].m_cgiHash ) continue; // upload file buttons are always dup of another parm if ( m_parms[j].m_type == TYPE_FILEUPLOADBUTTON ) continue; log(LOG_LOGIC,"conf: Cgi parm for #%"INT32" \"%s\" " "matches #%"INT32" \"%s\". Exiting.", i,m_parms[i].m_cgi,j,m_parms[j].m_cgi); exit(-1); } } int32_t mm = (int32_t)sizeof(CollectionRec); if ( (int32_t)sizeof(Conf) > mm ) mm = (int32_t)sizeof(Conf); if ( (int32_t)sizeof(SearchInput) > mm ) mm = (int32_t)sizeof(SearchInput); // . set size of each parm based on its type // . also do page and obj inheritance // . also do sanity checking for ( int32_t i = 0 ; i < m_numParms ; i++ ) { // sanity check if ( m_parms[i].m_off > mm || //m_parms[i].m_soff > mm || m_parms[i].m_smaxc > mm ) { log(LOG_LOGIC,"conf: Bad offset in parm #%"INT32" %s." " (%"INT32",%"INT32",%"INT32"). Did you FORGET to include " "an & before the cr.myVariable when setting " "m_off for this parm? Or subtract 'x' instead " "of 'g' or vice versa.", i,m_parms[i].m_title, mm, m_parms[i].m_off, //m_parms[i].m_soff, m_parms[i].m_smaxc); exit(-1); } // do not allow numbers in cgi parms, they are used for // denoting array indices int32_t j = 0; for ( ; m_parms[i].m_cgi && m_parms[i].m_cgi[j] ; j++ ) { if ( is_digit ( m_parms[i].m_cgi[j] ) ) { log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has " "number in cgi name.", i,m_parms[i].m_title); exit(-1); } } // these inheriting cause too many problems when moving // parms around in the array // inherit page //if ( i > 0 && m_parms[i].m_page == -1 ) // m_parms[i].m_page = m_parms[i-1].m_page; // inherit obj //if ( i > 0 && m_parms[i].m_obj == -1 ) // m_parms[i].m_obj = m_parms[i-1].m_obj; // sanity now if ( m_parms[i].m_page == -1 ) { log("parms: bad page \"%s\"",m_parms[i].m_title); char *xx=NULL;*xx=0; } if ( m_parms[i].m_obj == -1 ) { log("parms: bad obj \"%s\"",m_parms[i].m_title); char *xx=NULL;*xx=0; } // if its a fixed size then make sure m_size is not set if ( m_parms[i].m_fixed > 0 ) { if ( m_parms[i].m_size != 0 ) { log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" is " "fixed but size is not 0.", i,m_parms[i].m_title); exit(-1); } } // string sizes should already be set! 
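// . the chain of ifs below maps each parm type to the byte width of
//   the underlying C++ member, e.g. TYPE_BOOL -> 1 byte, TYPE_LONG ->
//   4 bytes, TYPE_LONG_LONG -> 8 bytes. string types have no default
//   width and must supply an explicit m_size in the parm definition.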
size = 0; t = m_parms[i].m_type;
if ( t == -1 ) { log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no type.", i,m_parms[i].m_title); exit(-1); }
// skip if already set
if ( m_parms[i].m_size ) goto skipSize;
if ( t == TYPE_CHAR ) size = 1;
if ( t == TYPE_CHAR2 ) size = 1;
if ( t == TYPE_BOOL ) size = 1;
if ( t == TYPE_BOOL2 ) size = 1;
if ( t == TYPE_CHECKBOX ) size = 1;
if ( t == TYPE_PRIORITY ) size = 1;
if ( t == TYPE_PRIORITY2 ) size = 1;
//if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( t == TYPE_UFP ) size = 1;
if ( t == TYPE_PRIORITY_BOXES ) size = 1;
if ( t == TYPE_RETRIES ) size = 1;
if ( t == TYPE_TIME ) size = 6;
if ( t == TYPE_DATE2 ) size = 4;
if ( t == TYPE_DATE ) size = 4;
if ( t == TYPE_FLOAT ) size = 4;
if ( t == TYPE_DOUBLE ) size = 8;
if ( t == TYPE_IP ) size = 4;
if ( t == TYPE_RULESET ) size = 4;
if ( t == TYPE_LONG ) size = 4;
if ( t == TYPE_LONG_CONST ) size = 4;
if ( t == TYPE_LONG_LONG ) size = 8;
if ( t == TYPE_STRING ) size = m_parms[i].m_size;
if ( t == TYPE_STRINGBOX ) size = m_parms[i].m_size;
if ( t == TYPE_STRINGNONEMPTY ) size = m_parms[i].m_size;
if ( t == TYPE_SITERULE ) size = 4;
// comments and commands do not control underlying variables
if ( size == 0 && t != TYPE_COMMENT && t != TYPE_CMD && t != TYPE_SAFEBUF && t != TYPE_FILEUPLOADBUTTON && t != TYPE_CONSTANT && t != TYPE_CHARPTR && t != TYPE_MONOD2 && t != TYPE_MONOM2 ) { log(LOG_LOGIC,"conf: Size of parm #%"INT32" \"%s\" " "not set.", i,m_parms[i].m_title); exit(-1); }
m_parms[i].m_size = size;
skipSize:
// check offset
if ( m_parms[i].m_obj == OBJ_NONE ) continue;
if ( t == TYPE_COMMENT ) continue;
if ( t == TYPE_FILEUPLOADBUTTON ) continue;
if ( t == TYPE_CMD ) continue;
if ( t == TYPE_CONSTANT ) continue;
if ( t == TYPE_MONOD2 ) continue;
if ( t == TYPE_MONOM2 ) continue;
if ( t == TYPE_SAFEBUF ) continue;
// search parms do not need an offset
if ( m_parms[i].m_off == -1 ){//&& m_parms[i].m_sparm == 0 ) {
log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no offset.", i,m_parms[i].m_title); exit(-1); }
if ( m_parms[i].m_off < -1 ) { log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has bad offset " "of %"INT32".", i,m_parms[i].m_title,m_parms[i].m_off); exit(-1); }
// use m_parms[i] here, not "m" -- by this point "m" points one past
// the last defined parm, so dereferencing it would check garbage
if ( m_parms[i].m_obj == OBJ_CONF && m_parms[i].m_off >= (int32_t)sizeof(Conf) ) { log("admin: Parm %s has bad m_off value.",m_parms[i].m_title); char *xx = NULL; *xx = 0; }
if ( m_parms[i].m_obj == OBJ_COLL && m_parms[i].m_off >= (int32_t)sizeof(CollectionRec) ) { log("admin: Parm %s has bad m_off value.",m_parms[i].m_title); char *xx = NULL; *xx = 0; }
if ( m_parms[i].m_off >= 0 && m_parms[i].m_obj == OBJ_SI && m_parms[i].m_off >= (int32_t)sizeof(SearchInput) ) { log("admin: Parm %s has bad m_off value.",m_parms[i].m_title); char *xx = NULL; *xx = 0; }
if ( m_parms[i].m_page == -1 ) { log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no page.", i,m_parms[i].m_title); exit(-1); }
if ( m_parms[i].m_obj == -1 ) { log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no object.", i,m_parms[i].m_title); exit(-1); }
//if ( ! m_parms[i].m_title[0] ) { // log(LOG_LOGIC,"conf: Parm #%"INT32" \"%s\" has no title.", // i,m_parms[i].m_cgi); // exit(-1); //}
// continue if already have the xml name
if ( m_parms[i].m_xml ) continue;
// set xml based on title
char *tt = m_parms[i].m_title;
if ( p + gbstrlen(tt) >= pend ) { log(LOG_LOGIC,"conf: Not enough room to store xml " "tag name in buffer."); exit(-1); }
m_parms[i].m_xml = p;
for ( int32_t k = 0 ; tt[k] ; k++ ) { if ( ! 
is_alnum_a(tt[k]) ) continue; if ( k > 0 && tt[k-1]==' ') *p++ = to_upper_a(tt[k]); else *p++ = tt[k]; } *p++ = '\0'; } // set m_searchParms int32_t n = 0; for ( int32_t i = 0 ; i < m_numParms ; i++ ) { //if ( ! m_parms[i].m_sparm ) continue; if ( m_parms[i].m_obj != OBJ_SI ) continue; m_searchParms[n++] = &m_parms[i]; // sanity check if ( m_parms[i].m_off == -1 ) { log(LOG_LOGIC,"conf: SEARCH Parm #%"INT32" \"%s\" has " "m_off < 0 (offset into SearchInput).", i,m_parms[i].m_title); exit(-1); } } m_numSearchParms = n; // . sanity check // . we should have it all covered! si.test(); // // parm overlap detector // // . fill in each parm's buffer with byte #b // . inc b for each parm #ifndef _VALGRIND_ overlapTest(+1); overlapTest(-1); #endif } void Parms::overlapTest ( char step ) { int32_t start = 0; if ( step == -1 ) start = m_numParms - 1; //log("conf: Using step=%"INT32"",(int32_t)step); SearchInput tmpsi; GigablastRequest tmpgr; InjectionRequest tmpir; CollectionRec tmpcr; Conf tmpconf; char b; char *p1 , *p2; int32_t i; // sanity check: ensure parms do not overlap for ( i = start ; i < m_numParms && i >= 0 ; i += step ) { // skip comments if ( m_parms[i].m_type == TYPE_COMMENT ) continue; if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue; // skip if it is a broadcast switch, like "all spiders on" // because that modifies another parm, "spidering enabled" if ( m_parms[i].m_type == TYPE_BOOL2 ) continue; if ( m_parms[i].m_type == TYPE_SAFEBUF ) continue; // we use cr->m_spideringEnabled for PAGE_BASIC_SETTINGS too! if ( m_parms[i].m_flags & PF_DUP ) continue; p1 = NULL; if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr; if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf; if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi; if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr; if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir; if ( p1 ) p1 += m_parms[i].m_off; p2 = NULL; int32_t size = m_parms[i].m_size; // use i now b = (char)i; // string box type is a pointer!! if ( p1 ) memset ( p1 , b , size ); //log("conf: setting %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx", // size,m_parms[i].m_title,(int32_t)p1,b); // search input uses character ptrs!! if ( m_parms[i].m_type == TYPE_STRINGBOX ) size = 4; if ( m_parms[i].m_type == TYPE_STRING ) size = 4; if ( m_parms[i].m_fixed > 0 ) size *= m_parms[i].m_fixed ; if ( p2 ) memset ( p2 , b , size ); //log("conf: setting %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx " // "i=%"INT32"", size,m_parms[i].m_title,(int32_t)p2,b,i); } // // now make sure they are the same // if ( step == -1 ) b--; else b = 0; char *objStr = "none"; int32_t obj; char infringerB; int32_t j; int32_t savedi = -1; for ( i = 0 ; i < m_numParms ; i++ ) { // skip comments if ( m_parms[i].m_type == TYPE_COMMENT ) continue; if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue; // skip if it is a broadcast switch, like "all spiders on" // because that modifies another parm, "spidering enabled" if ( m_parms[i].m_type == TYPE_BOOL2 ) continue; if ( m_parms[i].m_type == TYPE_SAFEBUF ) continue; // we use cr->m_spideringEnabled for PAGE_BASIC_SETTINGS too! 
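// . this is the verify pass: every byte a parm stamped into the
//   dummy objects above must still equal that parm's own index byte.
//   if another parm's [m_off,m_off+m_size) range overlaps it, those
//   bytes got overwritten, meaning a TYPE_* given here disagrees with
//   the member actually declared in the *.h file.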
if ( m_parms[i].m_flags & PF_DUP ) continue; p1 = NULL; if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr; if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf; if ( m_parms[i].m_obj == OBJ_SI ) p1 = (char *)&tmpsi; if ( m_parms[i].m_obj == OBJ_GBREQUEST ) p1 = (char *)&tmpgr; if ( m_parms[i].m_obj == OBJ_IR ) p1 = (char *)&tmpir; if ( p1 ) p1 += m_parms[i].m_off; p2 = NULL; int32_t size = m_parms[i].m_size; b = (char) i; // save it obj = m_parms[i].m_obj; //log("conf: testing %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx " // "i=%"INT32"", size,m_parms[i].m_title,(int32_t)p1,b,i); for ( j = 0 ; p1 && j < size ; j++ ) { if ( p1[j] == b ) continue; // this has multiple parms pointing to it! //if ( m_parms[i].m_type == TYPE_BOOL2 ) continue; // or special cases... //if ( p1 == (char *)&tmpconf.m_spideringEnabled ) // continue; // set object type objStr = "??????"; if ( m_parms[i].m_obj == OBJ_COLL ) objStr = "CollectionRec.h"; if ( m_parms[i].m_obj == OBJ_CONF ) objStr = "Conf.h"; if ( m_parms[i].m_obj == OBJ_SI ) objStr = "SearchInput.h"; if ( m_parms[i].m_obj == OBJ_GBREQUEST ) objStr = "GigablastRequest/Parms.h"; if ( m_parms[i].m_obj == OBJ_IR ) objStr = "InjectionRequest/PageInject.h"; // save it infringerB = p1[j]; savedi = i; goto error; } // search input uses character ptrs!! if ( m_parms[i].m_type == TYPE_STRINGBOX ) size = 4; if ( m_parms[i].m_type == TYPE_STRING ) size = 4; if ( m_parms[i].m_fixed > 0 ) size *= m_parms[i].m_fixed ; objStr = "SearchInput.h"; //log("conf: testing %"INT32" bytes for %s at 0x%"XINT32" char=0x%hhx " // "i=%"INT32"", size,m_parms[i].m_title,(int32_t)p2,b,i); for ( j = 0 ; p2 && j < size ; j++ ) { if ( p2[j] == b ) continue; // save it infringerB = p2[j]; savedi = i; log("conf: got b=0x%hhx when it should have been " "b=0x%hhx",p2[j],b); goto error; } } return; error: log("conf: Had a parm value collision. Parm #%"INT32" " "\"%s\" (size=%"INT32") in %s has overlapped with another parm. " "Your TYPE_* for this parm or a neighbor of it " "does not agree with what you have declared it as in the *.h " "file.",i,m_parms[i].m_title,m_parms[i].m_size,objStr); if ( step == -1 ) b--; else b = 0; // show possible parms that could have overwritten it! for ( i = start ; i < m_numParms && i >= 0 ; i += step ) { //char *p1 = NULL; //if ( m_parms[i].m_obj == OBJ_COLL ) p1 = (char *)&tmpcr; //if ( m_parms[i].m_obj == OBJ_CONF ) p1 = (char *)&tmpconf; // skip if comment if ( m_parms[i].m_type == TYPE_COMMENT ) continue; if ( m_parms[i].m_type == TYPE_FILEUPLOADBUTTON ) continue; if ( m_parms[i].m_flags & PF_DUP ) continue; if ( m_parms[i].m_obj != m_parms[savedi].m_obj ) continue; // skip if no match //bool match = false; //if ( m_parms[i].m_obj == obj ) match = true; //if ( m_parms[i].m_sparm && // NOTE: these need to be fixed!!! b = (char) i; if ( b == infringerB ) log("conf: possible overlap with parm #%"INT32" in %s " "\"%s\" (size=%"INT32") " "xml=%s " "desc=\"%s\"", i,objStr,m_parms[i].m_title, m_parms[i].m_size, m_parms[i].m_xml, m_parms[i].m_desc); } log("conf: try including \"m->m_obj = OBJ_COLL;\" or " "\"m->m_obj = OBJ_CONF;\" in your parm definitions"); log("conf: failed overlap test. 
exiting."); exit(-1); } bool Parm::getValueAsBool ( SearchInput *si ) { if ( m_obj != OBJ_SI ) { char *xx=NULL;*xx=0; } char *p = (char *)si + m_off; return *(bool *)p; } int32_t Parm::getValueAsLong ( SearchInput *si ) { if ( m_obj != OBJ_SI ) { char *xx=NULL;*xx=0; } char *p = (char *)si + m_off; return *(int32_t *)p; } char *Parm::getValueAsString ( SearchInput *si ) { if ( m_obj != OBJ_SI ) { char *xx=NULL;*xx=0; } char *p = (char *)si + m_off; return *(char **)p; } ///////// // // new functions // ///////// bool Parms::addNewParmToList1 ( SafeBuf *parmList , collnum_t collnum , char *parmValString , int32_t occNum , char *parmName ) { // get the parm descriptor Parm *m = getParmFast1 ( parmName , NULL ); if ( ! m ) return log("parms: got bogus parm2 %s",parmName ); return addNewParmToList2 ( parmList,collnum,parmValString,occNum,m ); } // . make a parm rec using the prodivded string // . used to convert http requests into a parmlist // . string could be a float or int32_t or int64_t in ascii, as well as a string // . returns false w/ g_errno set on error bool Parms::addNewParmToList2 ( SafeBuf *parmList , collnum_t collnum , char *parmValString , int32_t occNum , Parm *m ) { // get value char *val = NULL; int32_t valSize = 0; //char buf[2+MAX_COLL_LEN]; int32_t val32; int64_t val64; char val8; float valf; /* char *obj = NULL; // we might be adding a collnum if a collection that is being // added via the CommandAddColl0() "addColl" or "addCrawl" or // "addBulk" commands. they will reserve the collnum, so it might // not be ready yet. if ( collnum != -1 ) { CollectionRec *cr = g_collectiondb.getRec ( collnum ); if ( cr ) obj = (char *)cr; // log("parms: no coll rec for %"INT32"",(int32_t)collnum); // return false; //} //obj = (char *)cr; } else { obj = (char *)&g_conf; } */ if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_SAFEBUF || m->m_type == TYPE_STRINGNONEMPTY ) { // point to string //val = obj + m->m_off; // Parm::m_size is the max string size //if ( occNum > 0 ) val += occNum * m->m_size; // stringlength + 1. no just make it the whole string in // case it does not use the \0 protocol //valSize = m->m_max; val = parmValString; // include \0 valSize = gbstrlen(val)+1; // sanity if ( val[valSize-1] != '\0' ) { char *xx=NULL;*xx=0; } } else if ( m->m_type == TYPE_LONG ) { // watch out for unsigned 32-bit numbers, so use atoLL() val64 = atoll(parmValString); val = (char *)&val64; valSize = 4; } else if ( m->m_type == TYPE_FLOAT ) { valf = atof(parmValString); val = (char *)&valf; valSize = 4; } else if ( m->m_type == TYPE_LONG_LONG ) { val64 = atoll(parmValString); val = (char *)&val64; valSize = 8; } else if ( m->m_type == TYPE_BOOL || m->m_type == TYPE_BOOL2 || m->m_type == TYPE_CHECKBOX || m->m_type == TYPE_PRIORITY2 || m->m_type == TYPE_UFP || m->m_type == TYPE_CHAR ) { val8 = atol(parmValString); //if ( parmValString && to_lower_a(parmValString[0]) == 'y' ) // val8 = 1; //if ( parmValString && to_lower_a(parmValString[0]) == 'n' ) // val8 = 0; val = (char *)&val8; valSize = 1; } // for resetting or restarting a coll i think the ascii arg is // the NEW reserved collnum, but for other commands then parmValString // will be NULL else if ( m->m_type == TYPE_CMD ) { val = parmValString; if ( val ) valSize = gbstrlen(val)+1; // . addcoll collection can not be too long // . 
TODO: supply a Parm::m_checkValFunc to ensure val is // legitimate, and set g_errno on error if ( strcmp(m->m_cgi,"addcoll") == 0 &&valSize-1>MAX_COLL_LEN){ log("admin: addcoll coll too long"); g_errno = ECOLLTOOBIG; return false; } // scan for holes if we hit the limit //if ( g_collectiondb.m_numRecs >= 1LL>>sizeof(collnum_t) ) } else if ( m->m_type == TYPE_IP ) { // point to string //val = obj + m->m_off; // Parm::m_size is the max string size //if ( occNum > 0 ) val += occNum * m->m_size; // stringlength + 1. no just make it the whole string in // case it does not use the \0 protocol val32 = atoip(parmValString); // store ip in binary format val = (char *)&val32; valSize = 4; } else { log("parms: shit unsupported parm type"); char *xx=NULL;*xx=0; } key96_t key = makeParmKey ( collnum , m , occNum ); // then key if ( ! parmList->safeMemcpy ( &key , sizeof(key) ) ) return false; // datasize if ( ! parmList->pushLong ( valSize ) ) return false; // and data if ( val && valSize && ! parmList->safeMemcpy ( val , valSize ) ) return false; return true; } // g_parms.addCurrentParmToList1 ( &parmList , cr , "spiderRoundNum" ); bool Parms::addCurrentParmToList1 ( SafeBuf *parmList , CollectionRec *cr , char *parmName ) { collnum_t collnum = -1; if ( cr ) collnum = cr->m_collnum; // get the parm descriptor int32_t occNum; Parm *m = getParmFast1 ( parmName , &occNum ); if ( ! m ) return log("parms: got bogus parm1 %s",parmName ); return addCurrentParmToList2 ( parmList , collnum, -1 , m ); } // . use the current value of the parm to make this record // . parm class itself already helps us reference the binary parm value bool Parms::addCurrentParmToList2 ( SafeBuf *parmList , collnum_t collnum , int32_t occNum , Parm *m ) { char *obj = NULL; if ( collnum != -1 ) { CollectionRec *cr = g_collectiondb.getRec ( collnum ); if ( ! cr ) return false; obj = (char *)cr; } else { obj = (char *)&g_conf; } char *data = obj + m->m_off; // Parm::m_size is the max string size int32_t dataSize = m->m_size; if ( occNum > 0 ) data += occNum * m->m_size; if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX || m->m_type == TYPE_SAFEBUF || m->m_type == TYPE_STRINGNONEMPTY ) // include \0 in string dataSize = gbstrlen(data) + 1; // if a safebuf, point to the string within if ( m->m_type == TYPE_SAFEBUF ) { SafeBuf *sb = (SafeBuf *)data; data = sb->getBufStart(); dataSize = sb->length(); // sanity if ( dataSize > 0 && !data[dataSize-1]){char *xx=NULL;*xx=0;} // include the \0 since we do it for strings above if ( dataSize > 0 ) dataSize++; // empty? make it \0 then to be like strings i guess if ( dataSize == 0 ) { data = "\0"; dataSize = 1; } // sanity check if ( dataSize > 0 && data[dataSize-1] ) {char *xx=NULL;*xx=0;} // if just a \0 then make it empty //if ( dataSize && !data[0] ) { // data = NULL; // dataSize = 0; //} } //int32_t occNum = -1; key96_t key = makeParmKey ( collnum , m , occNum ); /* // debug it log("parms: adding parm collnum=%i title=%s " "key=%s datasize=%i data=%s hash=%"UINT32 ,(int)collnum ,m->m_title ,KEYSTR(&key,sizeof(key)) ,(int)dataSize ,data ,(uint32_t)hash32(data,dataSize)); */ // then key if ( ! parmList->safeMemcpy ( &key , sizeof(key) ) ) return false; // size if ( ! parmList->pushLong ( dataSize ) ) return false; // and data if ( dataSize && ! 
parmList->safeMemcpy ( data , dataSize ) ) return false; return true; } // returns false and sets g_errno on error bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList, int32_t page , TcpSocket *sock ) { // false = useDefaultRec? CollectionRec *cr = g_collectiondb.getRec ( hr , false ); //if ( c ) { // cr = g_collectiondb.getRec ( hr ); // if ( ! cr ) log("parms: coll not found"); //} bool isMasterAdmin = g_conf.isMasterAdmin ( sock , hr ); // does this user have permission to update the parms? bool isCollAdmin = g_conf.isCollAdmin ( sock , hr ) ; // might be g_conf specific, not coll specific //bool hasPerm = false; // just knowing the collection name of a custom crawl means you // know the token, so you have permission //if ( cr && cr->m_isCustomCrawl ) hasPerm = true; //if ( hr->isLocal() ) hasPerm = true; // fix jenkins "GET /v2/crawl?token=crawlbottesting" request char *name = hr->getString("name"); char *token = hr->getString("token"); //if ( ! cr && token ) hasPerm = true; //if ( ! hasPerm ) { // //log("parms: no permission to set parms"); // //g_errno = ENOPERM; // //return false; // // just leave the parm list empty and fail silently // return true; //} // we set the parms in this collnum collnum_t parmCollnum = -1; if ( cr ) parmCollnum = cr->m_collnum; // turn the collnum into an ascii string for providing as args // when &reset=1 &restart=1 &delete=1 is given along with a // &c= or a &name=/&token= pair. char oldCollName[MAX_COLL_LEN+1]; oldCollName[0] = '\0'; if ( cr ) sprintf(oldCollName,"%"INT32"",(int32_t)cr->m_collnum); //////// // // HACK: if crawlbot user supplies a token, name, and seeds, and the // corresponding collection does not exist then assume it is an add // //////// char customCrawl = 0; char *path = hr->getPath(); // i think /crawlbot is only used by me to see PageCrawlBot.cpp // so don't bother... if ( strncmp(path,"/crawlbot",9) == 0 ) customCrawl = 0; if ( strncmp(path,"/v2/crawl",9) == 0 ) customCrawl = 1; if ( strncmp(path,"/v2/bulk" ,8) == 0 ) customCrawl = 2; if ( strncmp(path,"/v3/crawl",9) == 0 ) customCrawl = 1; if ( strncmp(path,"/v3/bulk" ,8) == 0 ) customCrawl = 2; // throw error if collection record custom crawl type doesn't equal // the crawl type of current request if (cr && customCrawl && customCrawl != cr->m_isCustomCrawl ) { g_errno = ECUSTOMCRAWLMISMATCH; return false; } bool hasAddCrawl = hr->hasField("addCrawl"); bool hasAddBulk = hr->hasField("addBulk"); bool hasAddColl = hr->hasField("addColl"); // sometimes they try to delete a collection that is not there so do // not apply this logic in that case! bool hasDelete = hr->hasField("delete"); bool hasRestart = hr->hasField("restart"); bool hasReset = hr->hasField("reset"); bool hasSeeds = hr->hasField("seeds"); // check for bulk jobs as well if ( ! hasSeeds ) hasSeeds = hr->hasField("urls"); if ( ! cr && token && name && customCrawl && hasSeeds && ! hasDelete && ! hasRestart && ! hasReset && ! hasAddCrawl && ! hasAddBulk && ! hasAddColl ) { // reserve a new collnum for adding this crawl parmCollnum = g_collectiondb.reserveCollNum(); // must be there! if ( parmCollnum == -1 ) { g_errno = EBADENGINEER; return false; } // log it for now log("parms: trying to add custom crawl (%"INT32")", (int32_t)parmCollnum); // formulate name char newName[MAX_COLL_LEN+1]; snprintf(newName,MAX_COLL_LEN,"%s-%s",token,name); char *cmdStr = "addCrawl"; if ( customCrawl == 2 ) cmdStr = "addBulk"; // add to parm list if ( ! 
addNewParmToList1 ( parmList , parmCollnum , newName , -1 , // occNum
cmdStr ) ) return false; }
// loop through cgi parms
for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) {
// get cgi parm name
char *field = hr->getField ( i );
// get value of the cgi field
char *val = hr->getValue (i);
// convert field to parm
int32_t occNum;
// parm names can be shared across pages, like "c" // for search, addurl, inject, etc.
Parm *m = getParmFast1 ( field , &occNum );
if ( ! m ) continue;
// skip if not a command parm, like "addcoll"
if ( m->m_type != TYPE_CMD ) continue;
if ( m->m_obj != OBJ_CONF && m->m_obj != OBJ_COLL ) continue;
// // HACK //
// if its a resetcoll/restartcoll/addcoll we have to // get the next available collnum and use that for setting // any additional parms. that is the coll it will act on.
if ( strcmp(m->m_cgi,"addColl") == 0 || // lowercase support. camelcase is obsolete.
strcmp(m->m_cgi,"addcoll") == 0 || strcmp(m->m_cgi,"addCrawl") == 0 || strcmp(m->m_cgi,"addBulk" ) == 0 || strcmp(m->m_cgi,"reset" ) == 0 || strcmp(m->m_cgi,"restart" ) == 0 ) {
// if we wanted to we could make the data the // new parmCollnum since we already store the old // collnum in the parm rec key
parmCollnum = g_collectiondb.reserveCollNum();
// // // NOTE: the old collnum is in the "val" already // like "&reset=462" or "&addColl=test" // //
// sanity. if all are full! we hit our limit of // 32k collections. should increase collnum_t from // int16_t to int32_t...
if ( parmCollnum == -1 ) { g_errno = EBADENGINEER; return false; } }
// . DIFFBOT HACK: so ppl can manually restart a spider round
// . val can be 0 or 1 or anything. i.e. roundStart=0 works.
// . map this parm to another parm with the round start // time (current time) and the new round # as the args.
// . this will call CommandForceNextSpiderRound() function // on every shard with these args, "tmpVal".
if ( cr && strcmp(m->m_cgi,"roundStart") == 0 ) {
// use the current time so anything spidered before // this time (the round start time) will be respidered
//sprintf(tmp,"%"UINT32"",getTimeGlobalNoCore()); //val = tmp;
char tmpVal[64];
// use the same round start time for all shards
sprintf(tmpVal, "%"UINT32",%"INT32"" ,(uint32_t)getTimeGlobalNoCore() ,cr->m_spiderRoundNum+1 );
// . also add command to reset crawl/process counts // so if you hit maxToProcess/maxToCrawl it will // not stop the round from restarting
// . CommandResetCrawlCounts()
if ( ! addNewParmToList1 ( parmList , parmCollnum , tmpVal, // a string
0 , // occNum (for arrays)
"forceround" ) ) return false;
// don't bother going below
continue; }
// if a collection name was also provided, assume that is // the target of the reset/delete/restart. we still // need PageAddDelete.cpp to work...
if ( cr && ( strcmp(m->m_cgi,"reset" ) == 0 || strcmp(m->m_cgi,"delete" ) == 0 || strcmp(m->m_cgi,"restart" ) == 0 ) )
// the collnum to reset/restart/del // given as a string.
val = oldCollName;
// // CLOUD SEARCH ENGINE SUPPORT // //
// if this is the "delcoll" parm then "c" may have been // excluded from http request, therefore isCollAdmin and // isMasterAdmin may be false, so see if they have permission // for the "val" collection for this one...
bool hasPerm = false;
if ( m->m_page == PAGE_DELCOLL && strcmp(m->m_cgi,"delcoll") == 0 ) {
// permission override for /admin/delcoll cmd & parm
hasPerm = g_conf.isCollAdminForColl (sock,hr,val); }
// if this IP c-block has already added a collection then do not // allow it to add another. 
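// . note: numAdded below is currently hard-coded to 0, so this
//   one-collection-per-c-block limit never actually fires until the
//   c-block scan gets implemented.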
if ( m->m_page == PAGE_ADDCOLL && g_conf.m_allowCloudUsers && ! isMasterAdmin && strcmp(m->m_cgi,"addcoll")==0 ) {
// see if user's c block has already added a collection
int32_t numAdded = 0;
if ( numAdded >= 1 ) { g_errno = ENOPERM; log("parms: already added a collection from " "this cloud user's c-block."); return false; }
hasPerm = true; }
// master controls require root permission
if ( m->m_obj == OBJ_CONF && ! isMasterAdmin ) { log("parms: could not run root parm \"%s\" no perm.", m->m_title); continue; }
// need to have permission for collection for collrec parms
if ( m->m_obj == OBJ_COLL && ! isCollAdmin && ! hasPerm ) { log("parms: could not run coll parm \"%s\" no perm.", m->m_title); continue; }
// add the cmd parm
if ( ! addNewParmToList2 ( parmList , // it might be a collection-less // command like 'gb stop' which // uses the "save=1" parm. // this is the "new" collnum to // create in the case of // add/reset/restart, but in the // case of delete it is -1 or old.
parmCollnum , // the argument to the function... // in the case of delete, the // collnum to delete in ascii. // in the case of add, the name // of the new coll. in the case // of reset/restart the OLD // collnum is ascii to delete.
val, occNum , m ) ) return false; }
// if we are on the url filters page, turn off all checkboxes! // html should really transmit them as =0 if they are unchecked!!
// "fe" is a url filter expression for the first row.
//if ( hr->hasField("fe") && page == PAGE_FILTERS && cr ) { // for ( int32_t i = 0 ; i < cr->m_numRegExs ; i++ ) { // //cr->m_harvestLinks [i] = 0; // //cr->m_spidersEnabled[i] = 0; // if ( ! addNewParmToList2 ( parmList , // cr->m_collnum, // "0", // i, // } //}
// // CLOUD SEARCH ENGINE SUPPORT //
// provide userip so when adding a new collection we can // store it in the collection rec to ensure that the same // IP address cannot add more than one collection. //
if ( sock && page == PAGE_ADDCOLL ) {
char *ipStr = iptoa(sock->m_ip);
int32_t occNum;
Parm *um = getParmFast1 ( "userip" , &occNum);
if ( ! addNewParmToList2 ( parmList , // HACK! operate on the to-be-added // collrec, if there was an addcoll // reset or restart coll cmd...
parmCollnum , ipStr, // val ,
occNum , um ) ) return false; }
// // now add the parms that are NOT commands //
// loop through cgi parms
for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) {
// get cgi parm name
char *field = hr->getField ( i );
// get value of the cgi field
char *val = hr->getValue (i);
// get the occurrence # if its regex. this is the row # // in the url filters table, since those parms repeat names.
// url filter expression.
//if ( strcmp(field,"fe") == 0 ) occNum++;
// convert field to parm
int32_t occNum;
Parm *m = getParmFast1 ( field , &occNum );
// // map "pause" to spidering enabled //
if ( strcmp(field,"pause" ) == 0 || strcmp(field,"pauseCrawl") == 0 ) {
m = getParmFast1 ( "cse", &occNum);
if ( val && val[0] == '0' ) val = "1";
else if ( val && val[0] == '1' ) val = "0";
if ( ! m ) { char *xx=NULL;*xx=0; } }
if ( ! m ) continue;
// skip if IS a command parm, like "addcoll", we did that above
if ( m->m_type == TYPE_CMD ) continue;
if ( m->m_obj != OBJ_CONF && m->m_obj != OBJ_COLL ) continue;
// // CLOUD SEARCH ENGINE SUPPORT //
// master controls require root permission. otherwise, just // knowing the collection name is enough for a cloud user // to change settings. //
bool hasPerm = false;
// master controls require root permission
if ( m->m_obj == OBJ_CONF && ! 
isMasterAdmin ) { log("parms: could not set root parm \"%s\" no perm.", m->m_title); continue; } // need to have permission for collection for collrec parms if ( m->m_obj == OBJ_COLL && ! isCollAdmin && ! hasPerm ) { log("parms: could not set coll parm \"%s\" no perm.", m->m_title); continue; } // convert spiderRoundStartTime=0 (roundStart=0 roundStart=1) // to spiderRoundStartTime=+30secs // so that will force the next spider round to kick in /* bool restartRound = false; char tmp[24]; if ( strcmp(field,"roundStart")==0 && val && (val[0]=='0'||val[0]=='1') && val[1]==0 ) sprintf(tmp,"%"UINT32"",(int32_t)getTimeGlobalNoCore()+0); val = tmp; } */ // add it to a list now if ( ! addNewParmToList2 ( parmList , // HACK! operate on the to-be-added // collrec, if there was an addcoll // reset or restart coll cmd... parmCollnum , val , occNum , m ) ) return false; } return true; } Parm *Parms::getParmFast2 ( int32_t cgiHash32 ) { static HashTableX s_pht; static char s_phtBuf[26700]; static bool s_init = false; if ( ! s_init ) { // init hashtable s_pht.set ( 4,sizeof(char *),2048,s_phtBuf,26700, false,0,"phttab" ); // reduce hash collisions: s_pht.m_useKeyMagic = true; // wtf? if ( m_numParms <= 0 ) init(); if ( m_numParms <= 0 ) { char *xx=NULL;*xx=0; } // fill up hashtable for ( int32_t i = 0 ; i < m_numParms ; i++ ) { // get it Parm *parm = &m_parms[i]; // skip parms that are not for conf or coll lest // it bitch that "c" is duplicated... if ( parm->m_obj != OBJ_CONF && parm->m_obj != OBJ_COLL ) continue; // skip comments if ( parm->m_type == TYPE_COMMENT ) continue; if ( parm->m_type == TYPE_FILEUPLOADBUTTON ) continue; // skip if no cgi if ( ! parm->m_cgi ) continue; // get its hash of its cgi int32_t ph32 = parm->m_cgiHash; // sanity! if ( s_pht.isInTable ( &ph32 ) ) { // get the dup guy Parm *dup = *(Parm **)s_pht.getValue(&ph32); // same underlying parm? // like for "all spiders on" vs. // "all spiders off"? if ( dup->m_off == parm->m_off ) continue; // otherwise bitch about it and drop core log("parms: dup parm h32=%"INT32" " "\"%s\" vs \"%s\"", ph32, dup->m_title,parm->m_title); char *xx=NULL;*xx=0; } // add that to hash table s_pht.addKey ( &ph32 , &parm ); } // do not do this again s_init = true; } Parm **pp = (Parm **)s_pht.getValue ( &cgiHash32 ); if ( ! pp ) return NULL; return *pp; } Parm *Parms::getParmFast1 ( char *cgi , int32_t *occNum ) { // strip off the %"INT32" for things like 'fe3' for example // because that is the occurrence # for parm arrays. int32_t clen = gbstrlen(cgi); char *d = NULL; if ( clen > 1 ) { d = cgi + clen - 1; while ( is_digit(*d) ) d--; d++; } int32_t h32; // assume not an array if ( occNum ) *occNum = -1; if ( d && *d ) { if ( occNum ) *occNum = atol(d); h32 = hash32 ( cgi , d - cgi ); } else h32 = hash32n ( cgi ); Parm *m = getParmFast2 ( h32 ); if ( ! m ) return NULL; // the first element does not have a number after it if ( m->isArray() && occNum && *occNum == -1 ) *occNum = 0; return m; } //////////// // // functions for distributing/syncing parms to/with all hosts // //////////// class ParmNode { public: SafeBuf m_parmList; int32_t m_numRequests; int32_t m_numReplies; int32_t m_numGoodReplies; int32_t m_numHostsTotal; class ParmNode *m_prevNode; class ParmNode *m_nextNode; int64_t m_parmId; bool m_calledCallback; int32_t m_startTime; void *m_state; void (* m_callback)(void *state); bool m_sendToGrunts; bool m_sendToProxies; int32_t m_hostId; // -1 means send parm update to all hosts // . if not -1 then [m_hostId,m_hostId2] is a range // . 
used by main.cpp cmd line cmds like 'gb stop 3-5' int32_t m_hostId2; }; static ParmNode *s_headNode = NULL; static ParmNode *s_tailNode = NULL; static int64_t s_parmId = 0LL; // . will send the parm update request to each host and retry forever, // until dead hosts come back up // . keeps parm update requests in order received // . returns true and sets g_errno on error // . returns false if blocked and will call your callback bool Parms::broadcastParmList ( SafeBuf *parmList , void *state , void (* callback)(void *) , bool sendToGrunts , bool sendToProxies , // this is -1 if sending to all hosts int32_t hostId , // this is not -1 if its range [hostId,hostId2] int32_t hostId2 ) { // empty list? if ( parmList->getLength() <= 0 ) return true; // only us? no need for this then. we now do this... //if ( g_hostdb.m_numHosts <= 1 ) return true; // make a new parm transmit node ParmNode *pn = (ParmNode *)mmalloc ( sizeof(ParmNode) , "parmnode" ); if ( ! pn ) return true; pn->m_parmList.constructor(); // update the ticket #. we use this to keep things ordered too. // this should never be zero since it starts off at zero. s_parmId++; // set it pn->m_parmList.stealBuf ( parmList ); pn->m_numRequests = 0; pn->m_numReplies = 0; pn->m_numGoodReplies = 0; pn->m_numHostsTotal = 0; pn->m_prevNode = NULL; pn->m_nextNode = NULL; pn->m_parmId = s_parmId; // take a ticket pn->m_calledCallback = false; pn->m_startTime = getTimeLocal(); pn->m_state = state; pn->m_callback = callback; pn->m_sendToGrunts = sendToGrunts; pn->m_sendToProxies = sendToProxies; pn->m_hostId = hostId; pn->m_hostId2 = hostId2; // a range? then not -1 here. // store it ordered in our linked list of parm transmit nodes if ( ! s_tailNode ) { s_headNode = pn; s_tailNode = pn; } else { // link pn at end of tail s_tailNode->m_nextNode = pn; pn->m_prevNode = s_tailNode; // pn becomes the new tail s_tailNode = pn; } // just the regular proxies, not compression proxies if ( pn->m_sendToProxies ) pn->m_numHostsTotal += g_hostdb.getNumProxies(); if ( pn->m_sendToGrunts ) pn->m_numHostsTotal += g_hostdb.getNumGrunts(); if ( hostId >= 0 ) pn->m_numHostsTotal = 1; // pump the parms out to other hosts in the network doParmSendingLoop ( ); // . if waiting for more replies to come in that should be in soon // . doParmSendingLoop() is called when a reply comes in so that // the next requests can be sent out //if ( waitingForLiveHostsToReply() ) return false; // all done. how did this happen? //return true; // wait for replies return false; } void tryToCallCallbacks ( ) { ParmNode *pn = s_headNode; int32_t now = getTimeLocal(); for ( ; pn ; pn = pn->m_nextNode ) { // skip if already called callback if ( pn->m_calledCallback ) continue; // should we call the callback? bool callIt = false; if ( pn->m_numReplies >= pn->m_numRequests ) callIt = true; // sometimes we don't launch any requests to update parms // because we are jammed up. same logic as we use for // freeing the pn below. if ( pn->m_numGoodReplies < pn->m_numHostsTotal ) callIt = false; // 8 seconds is enough to wait for all replies to come in. // a host might be dead, so we need this here lest the // underlying page handler (i.e. sendPageCrawlbot()) never // get called if a host is dead. if you are updating some // parms you want the page to return. if ( now - pn->m_startTime > 8 && ! callIt && g_hostdb.hasDeadHost() ) callIt = true; if ( ! 
void tryToCallCallbacks ( ) {
    ParmNode *pn = s_headNode;
    int32_t now = getTimeLocal();
    for ( ; pn ; pn = pn->m_nextNode ) {
        // skip if already called callback
        if ( pn->m_calledCallback ) continue;
        // should we call the callback?
        bool callIt = false;
        if ( pn->m_numReplies >= pn->m_numRequests )
            callIt = true;
        // sometimes we don't launch any requests to update parms
        // because we are jammed up. same logic as we use for
        // freeing the pn below.
        if ( pn->m_numGoodReplies < pn->m_numHostsTotal )
            callIt = false;
        // 8 seconds is enough to wait for all replies to come in.
        // a host might be dead, so we need this here lest the
        // underlying page handler (i.e. sendPageCrawlbot()) never
        // get called if a host is dead. if you are updating some
        // parms you want the page to return.
        if ( now - pn->m_startTime > 8 && ! callIt &&
             g_hostdb.hasDeadHost() )
            callIt = true;
        if ( ! callIt ) continue;
        // callback is NULL for updating parms like spiderRoundNum
        // in Spider.cpp
        if ( pn->m_callback ) pn->m_callback ( pn->m_state );
        pn->m_calledCallback = true;
    }
}

void gotParmReplyWrapper ( void *state , UdpSlot *slot ) {
    // don't let udpserver free the send buf! that's the ParmNode
    // parmlist
    slot->m_sendBufAlloc = NULL;
    // in case host table is dynamically modified, go by #
    Host *h = g_hostdb.getHost((int32_t)(PTRTYPE)state);
    int32_t parmId = h->m_currentParmIdInProgress;
    ParmNode *pn = h->m_currentNodePtr;
    // inc this count
    pn->m_numReplies++;
    // nothing in progress now
    h->m_currentParmIdInProgress = 0;
    h->m_currentNodePtr = NULL;
    // this is usually timeout on a dead host i guess
    if ( g_errno ) {
        log("parms: got parm update reply from host #%"INT32": %s",
            h->m_hostId,mstrerror(g_errno));
    }
    // . note it so we do not retry every 1ms!
    // . and only retry on time outs or no mem errors for now...
    // . it'll retry once every 10 seconds using the sleep
    //   wrapper below
    if ( g_errno != EUDPTIMEDOUT && g_errno != ENOMEM )
        g_errno = 0;
    if ( g_errno ) {
        // remember error info for retry
        h->m_lastTryError = g_errno;
        h->m_lastTryTime = getTimeLocal();
        // if a host timed out he could be dead, so try to call
        // the callback for this "pn" anyway. if the only hosts we
        // do not have replies for are dead, then we'll call the
        // callback, but still keep trying to send to them.
        tryToCallCallbacks ();
        // try to send more i guess? i think this is right otherwise
        // the callback might not ever get called
        g_parms.doParmSendingLoop();
        return;
    }
    // no error, otherwise
    h->m_lastTryError = 0;
    // successfully completed
    h->m_lastParmIdCompleted = parmId;
    // inc this count
    pn->m_numGoodReplies++;
    // . this will try to call any callback that can be called
    // . for instance, if the "pn" has recvd all the replies
    // . OR if the remaining hosts are "DEAD"
    // . the callback is in the "pn"
    tryToCallCallbacks ();
    // nuke it?
    if ( pn->m_numGoodReplies >= pn->m_numHostsTotal &&
         pn->m_numReplies >= pn->m_numRequests ) {
        // . we must always be the head lest we send out of order.
        // . ParmNodes only destined to a specific hostid are ignored
        //   for this check, only look at those whose m_hostId is -1
        if ( pn != s_headNode && pn->m_hostId == -1 ) {
            log("parms: got parm request out of band. not head.");
        }
        // a new head
        if ( pn == s_headNode ) {
            // sanity
            if ( pn->m_prevNode ) { char *xx=NULL;*xx=0; }
            // the guy after us is the new head
            s_headNode = pn->m_nextNode;
        }
        // a new tail?
        if ( pn == s_tailNode ) {
            // sanity
            if ( pn->m_nextNode ) { char *xx=NULL;*xx=0; }
            // the guy before us is the new tail
            s_tailNode = pn->m_prevNode;
        }
        // empty?
        if ( ! s_headNode ) s_tailNode = NULL;
        // wtf?
        if ( ! pn->m_calledCallback ) { char *xx=NULL;*xx=0; }
        // do callback first before freeing pn
        //if ( pn->m_callback ) pn->m_callback ( pn->m_state );
        if ( pn->m_prevNode )
            pn->m_prevNode->m_nextNode = pn->m_nextNode;
        if ( pn->m_nextNode )
            pn->m_nextNode->m_prevNode = pn->m_prevNode;
        mfree ( pn , sizeof(ParmNode) , "pndfr");
    }
    // try to send more for him
    g_parms.doParmSendingLoop();
}

void parmLoop ( int fd , void *state ) {
    g_parms.doParmSendingLoop();
}

static bool s_registeredSleep = false;
static bool s_inLoop = false;
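// Editor's note (not in the original source): each Host carries a tiny
// per-host state machine that doParmSendingLoop() below drives:
//
//    m_currentParmIdInProgress  - nonzero while a 0x3f request is in
//                                 flight to that host
//    m_lastParmIdCompleted      - highest parm id the host has acked
//
// A host is fully caught up when its last completed id equals the
// global ticket counter. A sketch of that check, assuming only these
// Host members:
//
//    bool isCaughtUp ( Host *h ) {
//        return h->m_currentParmIdInProgress == 0 &&
//               h->m_lastParmIdCompleted == s_parmId;
//    }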
// . host #0 runs this to send out parms in the parm queue (linked list)
//   to all other hosts.
// . he also sends to himself, if m_sendToGrunts is true
bool Parms::doParmSendingLoop ( ) {
    if ( ! s_headNode ) return true;
    if ( g_isDumpingRdbFromMain ) return true;
    if ( s_inLoop ) return true;
    s_inLoop = true;
    if ( ! s_registeredSleep &&
         ! g_loop.registerSleepCallback(2000,NULL,parmLoop,0) )
        log("parms: failed to reg parm loop");
    // do not re-register
    s_registeredSleep = true;
    int32_t now = getTimeLocal();
    // try to send a parm update request to each host
    for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
        // get it
        Host *h = g_hostdb.getHost(i);
        // skip ourselves, host #0. we now send to ourselves
        // so updateParm() will be called on us...
        //if ( h->m_hostId == g_hostdb.m_myHostId ) continue;
        // . if in progress, gotta wait for that to complete
        // . 0 is not a legit parmid, it starts at 1
        if ( h->m_currentParmIdInProgress ) continue;
        // if his last completed parmid is the current he is
        // up-to-date
        if ( h->m_lastParmIdCompleted == s_parmId ) continue;
        // if last try had an error, wait 10 secs i guess
        if ( h->m_lastTryError &&
             h->m_lastTryError != EUDPTIMEDOUT &&
             now - h->m_lastTryTime < 10 ) continue;
        // otherwise get him the next to send
        ParmNode *pn = s_headNode;
        for ( ; pn ; pn = pn->m_nextNode ) {
            // stop when we got a parmnode we have not sent to
            // him yet, we'll send it now
            if ( pn->m_parmId > h->m_lastParmIdCompleted ) break;
        }
        // nothing? strange. something is not right.
        if ( ! pn ) {
            log("parms: pn is null");
            break;
        }
        // give him a free pass? some parm updates are directed to
        // a single host, we use this for syncing parms at startup.
        if ( pn->m_hostId >= 0 &&
             pn->m_hostId2 == -1 && // not a range
             h->m_hostId != pn->m_hostId ) {
            // assume we sent it to him
            h->m_lastParmIdCompleted = pn->m_parmId;
            h->m_currentNodePtr = NULL;
            continue;
        }
        // range? if not in range, give free pass
        if ( pn->m_hostId >= 0 &&
             pn->m_hostId2 >= 0 &&
             ( h->m_hostId < pn->m_hostId ||
               h->m_hostId > pn->m_hostId2 ) ) {
            // assume we sent it to him
            h->m_lastParmIdCompleted = pn->m_parmId;
            h->m_currentNodePtr = NULL;
            continue;
        }
        // force completion if we should NOT send to him
        if ( (h->isProxy() && ! pn->m_sendToProxies) ||
             (h->isGrunt() && ! pn->m_sendToGrunts ) ) {
            h->m_lastParmIdCompleted = pn->m_parmId;
            h->m_currentNodePtr = NULL;
            continue;
        }
        // debug log
        log(LOG_INFO,"parms: sending parm request id %i "
            "to hostid %"INT32"",(int)pn->m_parmId,h->m_hostId);
        // count it
        pn->m_numRequests++;
        // ok, he's available
        if ( ! g_udpServer.sendRequest (
                    pn->m_parmList.getBufStart(),
                    pn->m_parmList.length() ,
                    0x3f , // a new msgtype
                    h->m_ip, // ip
                    h->m_port, // port
                    h->m_hostId ,
                    NULL, // retslot
                    (void *)(PTRTYPE)h->m_hostId , // state
                    gotParmReplyWrapper ,
                    30 , // timeout secs
                    -1 , // backoff
                    -1 , // maxwait
                    NULL , // replybuf
                    0 , // replybufmaxsize
                    0 ) ) { // niceness
            log("parms: failed to send: %s",mstrerror(g_errno));
            continue;
        }
        // flag this
        h->m_currentParmIdInProgress = pn->m_parmId;
        h->m_currentNodePtr = pn;
    }
    s_inLoop = false;
    return true;
}

void handleRequest3fLoop ( void *weArg ) ;

void handleRequest3fLoop2 ( void *state , UdpSlot *slot ) {
    handleRequest3fLoop(state);
}

// if a tree is saving while we are trying to delete a collnum (or reset)
// then the call to updateParm() below returns false and we must re-call
// in this sleep wrapper here
void handleRequest3fLoop3 ( int fd , void *state ) {
    g_loop.unregisterSleepCallback(state,handleRequest3fLoop3);
    handleRequest3fLoop(state);
}
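// Editor's sketch (not in the original source): the 0x3f request body
// that handleRequest3fLoop() walks below is just parm recs laid end to
// end, each one keyed by makeParmKey():
//
//    key96_t key;        // collnum | cgiHash32 | occNum | delbit
//    int32_t dataSize;   // size of the data that follows
//    char    data[];     // parm value; \0-terminated utf8 for strings,
//                        // raw binary for numeric parm types
//
// so a reader advances by sizeof(key96_t) + 4 + dataSize per rec, which
// is exactly the layout getDataSizeFromParmRec() and
// getDataFromParmRec() assume.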
// . host #0 is requesting that we update some parms
void handleRequest3fLoop ( void *weArg ) {
    WaitEntry *we = (WaitEntry *)weArg;
    CollectionRec *cx = NULL;
    // process them
    char *p = we->m_parmPtr;
    for ( ; p < we->m_parmEnd ; ) {
        // shortcut
        char *rec = p;
        // get size
        int32_t dataSize = *(int32_t *)(rec+sizeof(key96_t));
        int32_t recSize = sizeof(key96_t) + 4 + dataSize;
        // skip it
        p += recSize;
        // get the actual parm
        Parm *parm = getParmFromParmRec ( rec );
        if ( ! parm ) {
            int32_t h32 = getHashFromParmRec(rec);
            log("parms: unknown parm sent to us hash=%"INT32"",h32);
            for ( int32_t i = 0 ; i < g_parms.m_numParms ; i++ ) {
                Parm *x = &g_parms.m_parms[i];
                if ( x->m_cgiHash != h32 ) continue;
                log("parms: unknown parm=%s",x->m_title);
                break;
            }
            continue;
        }
        // if it was the cmd to save & exit then first send a reply
        // back
        if ( ! we->m_sentReply &&
             parm->m_cgi &&
             parm->m_cgi[0] == 's' &&
             parm->m_cgi[1] == 'a' &&
             parm->m_cgi[2] == 'v' &&
             parm->m_cgi[3] == 'e' &&
             parm->m_cgi[4] == '\0' ) {
            // do not re-do this
            we->m_sentReply = 1;
            // note it
            log("parms: sending early parm update reply");
            // wait for reply to be sent and ack'd
            g_udpServer.sendReply_ass ( NULL,0,
                            NULL,0,
                            we->m_slot,
                            8, // timeout in secs
                            // come back here when done
                            we ,
                            handleRequest3fLoop2 );
            return;
        }
        // . determine if it alters the url filters
        // . if those were changed we have to nuke doledb and
        //   waiting tree in Spider.cpp and rebuild them!
        if ( parm->m_flags & PF_REBUILDURLFILTERS )
            we->m_doRebuilds = true;
        if ( parm->m_flags & PF_REBUILDPROXYTABLE )
            we->m_doProxyRebuild = true;
        if ( parm->m_flags & PF_REBUILDACTIVELIST )
            we->m_rebuildActiveList = true;
        // get collnum i guess
        if ( parm->m_type != TYPE_CMD )
            we->m_collnum = getCollnumFromParmRec ( rec );
        // see if our spider round changes
        if ( we->m_collnum >= 0 && ! cx )
            cx = g_collectiondb.getRec ( we->m_collnum );
        // . i guess coll might have gotten deleted! so check cx
        // . grab the round # every iteration so the comparison
        //   below never reads an uninitialized value
        int32_t oldRound = -1;
        if ( cx ) oldRound = cx->m_spiderRoundNum;
        // . this returns false if blocked, returns true and sets
        //   g_errno on error
        // . it'll block if trying to delete a coll when the tree
        //   is saving or something (CommandDeleteColl())
        if ( ! g_parms.updateParm ( rec , we ) ) {
            ////////////
            //
            // . it blocked! it will call we->m_callback when done
            // . we must re-call
            // . try again in 100ms
            //
            ////////////
            if ( ! g_loop.registerSleepCallback(100,
                        we ,
                        handleRequest3fLoop3,
                        0 ) ) { // niceness
                log("parms: failed to reg sleeper");
                return;
            }
            log("parms: updateParm blocked. waiting.");
            return;
        }
        if ( cx && oldRound != cx->m_spiderRoundNum )
            we->m_updatedRound = true;
        // do the next parm
        we->m_parmPtr = p;
        // error?
        if ( ! g_errno ) continue;
        // this could mean failed to add coll b/c out of disk or
        // something else that is bad
        we->m_errno = g_errno;
    }
    // one last thing... kinda hacky. if we change certain spidering
    // parms we have to do a couple rebuilds.
    // reset page round counts
    if ( we->m_updatedRound && cx ) {
        // Spider.cpp will reset the *ThisRound page counts and
        // the sent notification flag
        spiderRoundIncremented ( cx );
    }
    // basically resetting the spider here...
    if ( we->m_doRebuilds && cx ) {
        // . this tells Spider.cpp to rebuild the spider queues
        // . this is NULL if spider stuff never initialized yet,
        //   like if you just added the collection
        if ( cx->m_spiderColl )
            cx->m_spiderColl->m_waitingTreeNeedsRebuild = true;
        // . assume we have urls ready to spider too
        // . no, because if they change the filters and there are
        //   still no urls to spider i don't want to get another
        //   email alert!!
        //cr->m_localCrawlInfo .m_hasUrlsReadyToSpider = true;
        //cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
        // . reconstruct the url filters if we were a custom crawl
        // . this is used to abstract away the complexity of url
        //   filters in favor of simple regular expressions and
        //   substring matching for diffbot
        cx->rebuildUrlFilters();
    }
    if ( we->m_rebuildActiveList && cx )
        g_spiderLoop.m_activeListValid = false;
    // if user changed the list of proxy ips rebuild the binary
    // array representation of the proxy ips we have
    if ( we->m_doProxyRebuild )
        buildProxyTable();
    // note it
    if ( ! we->m_sentReply )
        log("parms: sending parm update reply");
    // send back reply now. empty reply for the most part
    if ( we->m_errno && ! we->m_sentReply )
        g_udpServer.sendErrorReply ( we->m_slot,we->m_errno,0 );
    else if ( ! we->m_sentReply )
        g_udpServer.sendReply_ass ( NULL,0,NULL,0,we->m_slot);
    // all done
    mfree ( we , sizeof(WaitEntry) , "weparm" );
    return;
}

// . host #0 is requesting that we update some parms
// . the readbuf in the request is the list of the parms
void handleRequest3f ( UdpSlot *slot , int32_t niceness ) {
    // sending to host #0 is not right...
    //if ( g_hostdb.m_hostId == 0 ) { char *xx=NULL;*xx=0; }
    char *parmRecs = slot->m_readBuf;
    char *parmEnd = parmRecs + slot->m_readBufSize;
    log("parms: got parm update request. size=%"INT32".",
        (int32_t)(parmEnd-parmRecs));
    // make a new waiting entry
    WaitEntry *we ;
    we = (WaitEntry *) mmalloc ( sizeof(WaitEntry),"weparm");
    if ( ! we ) {
        g_udpServer.sendErrorReply(slot,g_errno,60);
        return;
    }
    we->m_slot = slot;
    we->m_callback = handleRequest3fLoop;
    we->m_parmPtr = parmRecs;
    we->m_parmEnd = parmEnd;
    we->m_errno = 0;
    we->m_doRebuilds = false;
    we->m_rebuildActiveList = false;
    we->m_updatedRound = false;
    we->m_doProxyRebuild = false;
    we->m_collnum = -1;
    we->m_sentReply = 0;
    handleRequest3fLoop ( we );
}

////
//
// functions for syncing parms with host #0
//
////

// 1. we do not accept any recs into rdbs until in sync with host #0
// 2. at startup we send the hash of all parms for each collrec and
//    for g_conf (collnum -1) to host #0, then he will send us all the
//    parms for a collrec (or g_conf) if we are out of sync.
// 3. when host #0 changes a parm it lets everyone know via
//    broadcastParmList()
// 4. only host #0 may initiate parm changes. so don't let that go down!
// 5. once in sync a host can drop recs for collnums that are invalid
// 6. until in parm sync with host #0 reject adds to collnums we don't
//    have with ETRYAGAIN in Msg4.cpp
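// Editor's sketch (not in the original source) of the startup sync
// handshake the functions below implement:
//
//    grunt                          host #0
//    -----                          -------
//    0x3e: [collnum,nameHash,h64]*  ----->  compares each h64 against
//                                           its own parm list hashes
//          empty 0x3e reply         <-----
//                                   <-----  0x3f parm lists for any
//                                           out-of-sync collections,
//                                           plus addcoll/delete cmds,
//                                           ending with "insync"
//
// ordering holds because host #0 pushes the 0x3f traffic through the
// same broadcastParmList() queue as every other parm update.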
void tryToSyncWrapper ( int fd , void *state ) {
    g_parms.syncParmsWithHost0();
}

// host #0 just sends back an empty reply, but it will hit us with
// 0x3f parmlist requests. that way it uses the same mechanism and can
// guarantee ordering of the parm update requests
void gotReplyFromHost0Wrapper ( void *state , UdpSlot *slot ) {
    // ignore his reply unless error?
    if ( g_errno ) {
        log("parms: got error syncing with host 0: %s. Retrying.",
            mstrerror(g_errno));
        // re-try it!
        g_parms.m_triedToSync = false;
    }
    else {
        log("parms: synced with host #0");
        // do not re-call
        g_loop.unregisterSleepCallback(NULL,tryToSyncWrapper);
    }
    g_errno = 0;
}

// returns false and sets g_errno on error, true otherwise
bool Parms::syncParmsWithHost0 ( ) {
    if ( m_triedToSync ) return true;
    m_triedToSync = true;
    m_inSyncWithHost0 = false;
    // dont sync with ourselves
    if ( g_hostdb.m_hostId == 0 ) {
        m_inSyncWithHost0 = true;
        return true;
    }
    // only grunts for now can sync, not proxies, so stop if we are
    // a proxy
    if ( g_hostdb.m_myHost->m_type != HT_GRUNT ) {
        m_inSyncWithHost0 = true;
        return true;
    }
    SafeBuf hashList;
    if ( ! makeSyncHashList ( &hashList ) ) return false;
    // copy for sending
    SafeBuf sendBuf;
    if ( ! sendBuf.safeMemcpy ( &hashList ) ) return false;
    if ( sendBuf.getCapacity() != hashList.length() ) {
        char *xx=NULL;*xx=0; }
    if ( sendBuf.length() != hashList.length() ) {
        char *xx=NULL;*xx=0; }
    // allow udpserver to free it
    char *request = sendBuf.getBufStart();
    int32_t requestLen = sendBuf.length();
    sendBuf.detachBuf();
    Host *h = g_hostdb.getHost(0);
    log("parms: trying to sync with host #0");
    // . send it off. use 3e i guess
    // . host #0 will reply using msg4 really
    // . msg4 guarantees ordering of requests
    // . there will be a record that is CMD_INSYNC so when we get
    //   that we set g_parms.m_inSyncWithHost0 to true
    if ( ! g_udpServer.sendRequest ( request ,
                     requestLen,
                     0x3e , // msgtype
                     h->m_ip, // ip
                     h->m_port, // port
                     h->m_hostId , // host #0!!!
                     NULL, // retslot
                     NULL , // state
                     gotReplyFromHost0Wrapper ,
                     99999999 ) ) { // timeout in secs
        log("parms: error syncing with host 0: %s",
            mstrerror(g_errno));
        return false;
    }
    // wait now
    return true;
}
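// Editor's sketch (not in the original source): the 0x3e request body
// built by makeSyncHashList() below is a flat run of fixed-size
// entries, one per collection plus one leading entry for g_conf
// (collnum -1):
//
//    collnum_t collnum;        // sizeof(collnum_t) bytes; -1 = g_conf
//    uint32_t  collNameHash32; // hash32n() of the collection name
//    int64_t   h64;            // hash64() over addAllParmsToList()
//
// handleRequest3e() walks the entries with the same sizes, so both
// sides must agree on this layout exactly.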
// . here host #0 is receiving a sync request from another host
// . host #0 scans this list of hashes to make sure the requesting host
//   is in sync
// . host #0 will broadcast parm updates by calling broadcastParmList()
//   which uses 0x3f, so this just returns an empty reply on success
// . sends CMD "addcoll" and "delcoll" cmd parms as well
// . include an "insync" command parm as last parm
void handleRequest3e ( UdpSlot *slot , int32_t niceness ) {
    // right now we must be host #0
    if ( g_hostdb.m_hostId != 0 ) {
        g_errno = EBADENGINEER;
    hadError:
        g_udpServer.sendErrorReply(slot,g_errno,60);
        return;
    }
    //
    // 0. scan our collections and clear a flag
    //
    for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
        // skip if empty
        CollectionRec *cr = g_collectiondb.m_recs[i];
        if ( ! cr ) continue;
        // clear flag
        cr->m_hackFlag = 0;
    }
    Host *host = slot->m_host;
    int32_t hostId = -1;
    if ( host ) hostId = host->m_hostId;
    SafeBuf replyBuf;
    //
    // 1. update parms on collections we both have
    // 2. tell him to delete collections we do not have but he does
    //
    SafeBuf tmp;
    char *p = slot->m_readBuf;
    char *pend = p + slot->m_readBufSize;
    for ( ; p < pend ; ) {
        // get collnum
        collnum_t c = *(collnum_t *)p;
        p += sizeof(collnum_t);
        // then coll NAME hash
        uint32_t collNameHash32 = *(int32_t *)p;
        p += 4;
        // sanity check. -1 means g_conf. i guess.
        if ( c < -1 ) { char *xx=NULL;*xx=0; }
        // and parm hash
        int64_t h64 = *(int64_t *)p;
        p += 8;
        // if we, being host #0, do not have this collnum tell
        // him to delete it!
        CollectionRec *cr = NULL;
        if ( c >= 0 ) cr = g_collectiondb.getRec ( c );
        // if collection names are different delete it
        if ( cr && collNameHash32 != hash32n ( cr->m_coll ) ) {
            log("sync: host had collnum %i but wrong name, "
                "name not %s like it should be",(int)c,cr->m_coll);
            cr = NULL;
        }
        if ( c >= 0 && ! cr ) {
            // note in log
            logf(LOG_INFO,"sync: telling host #%"INT32" to delete "
                 "collnum %"INT32"",
                 hostId,(int32_t)c);
            // add the parm rec as a parm cmd
            if ( ! g_parms.addNewParmToList1( &replyBuf, c, NULL,
                              -1, "delete"))
                goto hadError;
            // ok, get next collection hash
            continue;
        }
        // set our hack flag so we know he has this collection
        if ( cr ) cr->m_hackFlag = 1;
        // get our parmlist for that collnum
        tmp.reset();
        // c is -1 for g_conf
        if ( ! g_parms.addAllParmsToList ( &tmp, c ) )
            goto hadError;
        // get checksum of that
        int64_t m64 = hash64 ( tmp.getBufStart(),tmp.length() );
        // if match, keep chugging, that's in sync
        if ( h64 == m64 ) continue;
        // note in log
        logf(LOG_INFO,"sync: sending all parms for collnum %"INT32" "
             "to host #%"INT32"",
             (int32_t)c, hostId);
        // otherwise, send him the list
        if ( ! replyBuf.safeMemcpy ( &tmp ) ) goto hadError;
    }
    //
    // 3. now if he's missing one of our collections tell him to add it
    //
    for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
        // skip if empty
        CollectionRec *cr = g_collectiondb.m_recs[i];
        if ( ! cr ) continue;
        // skip if he already has it
        if ( cr->m_hackFlag ) continue;
        //char *cmdStr = "addColl";
        // now use lowercase, not camelcase
        char *cmdStr = "addcoll";
        if ( cr->m_isCustomCrawl == 1 ) cmdStr = "addCrawl";
        if ( cr->m_isCustomCrawl == 2 ) cmdStr = "addBulk";
        // note in log
        logf(LOG_INFO,"sync: telling host #%"INT32" to add "
             "collnum %"INT32" coll=%s",
             hostId,(int32_t)cr->m_collnum, cr->m_coll);
        // add the parm rec as a parm cmd
        if ( ! g_parms.addNewParmToList1 ( &replyBuf,
                           (collnum_t)i,
                           cr->m_coll, // parm val
                           -1,
                           cmdStr ) )
            goto hadError;
        // and the parmlist for it
        if ( ! g_parms.addAllParmsToList ( &replyBuf, i ) )
            goto hadError;
    }
    // . final parm is the in sync stamp of approval which will set
    //   g_parms.m_inSyncWithHost0 to true. CommandInSync()
    // . use -1 for collnum for this cmd
    if ( ! g_parms.addNewParmToList1 ( &replyBuf,-1,NULL,-1,"insync"))
        goto hadError;
    // this should at least have the in sync command
    log("parms: sending %"INT32" bytes of parms to sync to host "
        "#%"INT32"",
        replyBuf.length(),hostId);
    // . use the broadcast call here so things keep their order!
    // . we do not need a callback when they have been completely
    //   broadcasted to all hosts so use NULL for that
    // . crap, we only want to send this to host #x ...
    g_parms.broadcastParmList ( &replyBuf ,
                    NULL ,
                    NULL ,
                    true , // sendToGrunts?
                    false , // sendToProxies?
                    hostId );
    // but do send back an empty reply to this 0x3e request
    g_udpServer.sendReply_ass ( NULL,0,NULL,0,slot);
    // send that back now
    //g_udpServer.sendReply_ass ( replyBuf.getBufStart() ,
    //                            replyBuf.length() ,
    //                            replyBuf.getBufStart() ,
    //                            replyBuf.getCapacity() ,
    //                            slot ); // udpserver will free it
    //replyBuf.detachBuf();
}
// get the hash of every collection's parmlist
bool Parms::makeSyncHashList ( SafeBuf *hashList ) {
    SafeBuf tmp;
    // first do g_conf, collnum -1!
    for ( int32_t i = -1 ; i < g_collectiondb.m_numRecs ; i++ ) {
        // shortcut
        CollectionRec *cr = NULL;
        if ( i >= 0 ) cr = g_collectiondb.m_recs[i];
        // skip if empty
        if ( i >= 0 && ! cr ) continue;
        // clear since last time
        tmp.reset();
        // g_conf? if i is -1 do g_conf
        if ( ! addAllParmsToList ( &tmp , i ) ) return false;
        // store collnum first (sizeof(collnum_t) bytes of it)
        if ( ! hashList->safeMemcpy ( &i , sizeof(collnum_t) ) )
            return false;
        // then store the collection name hash, 32 bit hash
        uint32_t collNameHash32 = 0;
        if ( cr ) collNameHash32 = hash32n ( cr->m_coll );
        if ( ! hashList->safeMemcpy ( &collNameHash32, 4 ) )
            return false;
        // hash the parms
        int64_t h64 = hash64 ( tmp.getBufStart(),tmp.length() );
        // and store it
        if ( ! hashList->pushLongLong ( h64 ) ) return false;
    }
    return true;
}

int32_t Parm::getNumInArray ( collnum_t collnum ) {
    char *obj = (char *)&g_conf;
    if ( m_obj == OBJ_COLL ) {
        CollectionRec *cr = g_collectiondb.getRec ( collnum );
        if ( ! cr ) return -1;
        obj = (char *)cr;
    }
    // # in array is stored right before the array itself
    return *(int32_t *)(obj+m_off-4);
}

// . we use this for syncing parms between hosts
// . called by convertAllCollRecsToParmList
// . returns false and sets g_errno on error
// . "rec" can be CollectionRec or g_conf ptr
bool Parms::addAllParmsToList ( SafeBuf *parmList, collnum_t collnum ) {
    // loop over parms
    for ( int32_t i = 0 ; i < m_numParms ; i++ ) {
        // get it
        Parm *parm = &m_parms[i];
        // skip comments
        if ( parm->m_type == TYPE_COMMENT ) continue;
        if ( parm->m_type == TYPE_FILEUPLOADBUTTON ) continue;
        // cmds
        if ( parm->m_type == TYPE_CMD ) continue;
        if ( parm->m_type == TYPE_BOOL2 ) continue;
        // daily merge last started. do not sync this...
        if ( parm->m_type == TYPE_LONG_CONST ) continue;
        if ( collnum == -1 && parm->m_obj != OBJ_CONF ) continue;
        if ( collnum >= 0 && parm->m_obj != OBJ_COLL ) continue;
        if ( collnum < -1 ) { char *xx=NULL;*xx=0; }
        // like 'statsdb max cache mem' etc.
        if ( parm->m_flags & PF_NOSYNC ) continue;
        // sanity, need cgi hash to look up the parm on the
        // receiving end
        if ( parm->m_cgiHash == 0 ) {
            log("parms: no cgi for parm %s",parm->m_title);
            char *xx=NULL; *xx=0;
        }
        int32_t occNum = -1;
        int32_t maxOccNum = 0;
        if ( parm->isArray() ) {
            maxOccNum = parm->getNumInArray(collnum) ;
            occNum = 0;
        }
        for ( ; occNum < maxOccNum ; occNum++ ) {
            // add each occ # to list
            if ( ! addCurrentParmToList2 ( parmList ,
                               collnum ,
                               occNum ,
                               parm ) )
                return false;
            /*
            //
            // use this to debug parm list checksums being off
            //
            int64_t h64 ;
            h64 = hash64 ( parmList->getBufStart(),
                       parmList->length() );
            // note it for debugging hash
            SafeBuf xb;
            parm->printVal ( &xb ,collnum,occNum);
            log("parms: adding (h=%"XINT64") parm %s = %s",
                h64,parm->m_title,xb.getBufStart());
            */
        }
    }
    return true;
}

void resetImportLoopFlag () ;
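// Editor's sketch (not in the original source): array parms rely on a
// layout convention where the element count is a 32-bit integer stored
// immediately before the array inside the host object. getNumInArray()
// above and the countPtr logic in updateParm() below both read it at
// offset m_off-4. Roughly, for a hypothetical collrec member:
//
//    class CollectionRecSketch {
//    public:
//        int32_t m_numRegExs;           // count lives at m_off-4
//        SafeBuf m_regExs[MAX_FILTERS]; // parm's m_off points here
//    };
//
// so a parm's m_off must never point at the first member of an object,
// and the count member must directly precede its array.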
// . this adds the key if not a cmd key to parmdb rdbtree
// . this executes cmds
// . this updates the CollectionRec which may disappear later and be
//   fully replaced by Parmdb, just an RdbTree really.
// . returns false if blocked
// . returns true and sets g_errno on error
bool Parms::updateParm ( char *rec , WaitEntry *we ) {
    collnum_t collnum = getCollnumFromParmRec ( rec );
    g_errno = 0;
    Parm *parm = getParmFromParmRec ( rec );
    if ( ! parm ) {
        log("parmdb: could not find parm for rec");
        g_errno = EBADENGINEER;
        return true;
    }
    // cmd to execute?
    if ( parm->m_type == TYPE_CMD ||
         // sitelist is a safebuf but it requires special deduping
         // logic to update it so it uses CommandUpdateSiteList() to
         // do the updating
         parm->m_func ) {
        // all parm rec data for TYPE_CMD should be ascii/utf8 chars
        // and should be \0 terminated
        char *data = getDataFromParmRec ( rec );
        int32_t dataSize = getDataSizeFromParmRec ( rec );
        if ( dataSize == 0 ) data = NULL;
        log("parmdb: running function for "
            "parm \"%s\" (collnum=%"INT32") args=\"%s\"",
            parm->m_title ,
            (int32_t)collnum ,
            data );
        // sets g_errno on error
        if ( parm->m_func ) {
            parm->m_func ( rec );
            return true;
        }
        // fix core from using "roundstart=1" on non-existent coll
        if ( ! parm->m_func2 ) {
            return true;
        }
        // . returns true and sets g_errno on error
        // . returns false if blocked
        // . this is for CommandDeleteColl() and CommandResetColl()
        if ( parm->m_func2 ( rec , we ) ) return true;
        // . it did not complete.
        // . we need to re-call it using the sleep wrapper above
        return false;
    }
    // "cr" will remain null when updating g_conf and collnum -1
    CollectionRec *cr = NULL;
    if ( collnum >= 0 ) {
        cr = g_collectiondb.getRec ( collnum );
        if ( ! cr ) {
            char *ps = "unknown parm";
            if ( parm ) ps = parm->m_title;
            log("parmdb: invalid collnum %"INT32" for parm \"%s\"",
                (int32_t)collnum,ps);
            g_errno = ENOCOLLREC;
            return true;
        }
    }
    // what are we updating?
    void *base = NULL;
    // we might have a collnum specified even if parm is global,
    // maybe there are some collection/local parms specified as well
    // that that collnum applies to
    if ( parm->m_obj == OBJ_COLL ) base = cr;
    else base = &g_conf;
    if ( ! base ) {
        log("parms: no collrec (%"INT32") to change parm",
            (int32_t)collnum);
        g_errno = ENOCOLLREC;
        return true;
    }
    int32_t occNum = getOccNumFromParmRec ( rec );
    // get data
    int32_t dataSize = *(int32_t *)(rec+sizeof(key96_t));
    char *data = rec+sizeof(key96_t)+4;
    // point to where to copy the data into the collrec
    char *dst = (char *)base + parm->m_off;
    // point to count in case it is an array
    int32_t *countPtr = NULL;
    // array?
    if ( parm->isArray() ) {
        if ( occNum < 0 ) {
            log("parms: bad occnum for %s",parm->m_title);
            return false;
        }
        // point to count in case it is an array
        countPtr = (int32_t *)(dst - 4);
        // now point "dst" to the occNum-th element
        dst += parm->m_size * occNum;
    }
    //
    // compare parm to see if it changed value
    //
    SafeBuf val1;
    parm->printVal ( &val1 , collnum , occNum );
    // if parm is a safebuf...
    if ( parm->m_type == TYPE_SAFEBUF ) {
        // point to it
        SafeBuf *sb = (SafeBuf *)dst;
        // nuke it
        sb->purge();
        // require that the \0 be part of the update i guess
        //if ( ! data || dataSize <= 0 ) { char *xx=NULL;*xx=0; }
        // check for \0
        if ( data && dataSize > 0 ) {
            if ( data[dataSize-1] != '\0') { char *xx=NULL;*xx=0;}
            // this means that we can not use string POINTERS as
            // parms!! don't include \0 as part of length
            sb->safeStrcpy ( data ); // , dataSize );
            // ensure null terminated
            sb->nullTerm();
            sb->setLabel("parm2");
        }
        //return true;
        // sanity
        // we no longer include the \0 in the dataSize...so a
        // dataSize of 0 means empty string...
        //if ( data[dataSize-1] != '\0' ) { char *xx=NULL;*xx=0; }
    }
    else {
        // and copy the data into collrec or g_conf
        gbmemcpy ( dst , data , dataSize );
    }
    SafeBuf val2;
    parm->printVal ( &val2 , collnum , occNum );
    // did this parm change value?
    bool changed = true;
    if ( strcmp ( val1.getBufStart() , val2.getBufStart() ) == 0 )
        changed = false;
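    // Editor's note (not in the original source): change detection here
    // is deliberately type-agnostic. Instead of comparing raw bytes per
    // parm type, the value is rendered to text with printVal() before
    // and after the write and the two strings are compared, so one code
    // path covers ints, floats, safebufs, ips, etc. The pattern in
    // isolation:
    //
    //    SafeBuf before, after;
    //    parm->printVal ( &before , collnum , occNum );
    //    /* ... overwrite the parm's storage ... */
    //    parm->printVal ( &after , collnum , occNum );
    //    bool changed = strcmp ( before.getBufStart() ,
    //                            after.getBufStart() ) != 0;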
    // . update the array count if necessary
    // . parm might not have changed value based on what was in there
    //   by default, but for PAGE_FILTERS the default value in the row
    //   for this parm might have been zero! so we gotta update its
    //   "count" in that scenario even though the parm val was
    //   unchanged.
    if ( parm->isArray() ) {
        // the int32_t before the array is the # of elements
        int32_t currentCount = *countPtr;
        // update our # elements in our array if this is bigger
        int32_t newCount = occNum + 1;
        bool updateCount = false;
        if ( newCount > currentCount ) updateCount = true;
        // do not update counts if we are url filters
        // and we are currently >= the expression count. we have
        // to have a non-empty expression at the end in order to
        // add the expression. this prevents the empty line from
        // being added!
        if ( parm->m_page == PAGE_FILTERS &&
             cr->m_regExs[occNum].getLength() == 0 )
            updateCount = false;
        // and for other pages, like master ips, skip if empty!
        // PAGE_PASSWORDS, PAGE_MASTERPASSWORDS, ...
        if ( parm->m_page != PAGE_FILTERS && ! changed )
            updateCount = false;
        // ok, increment the array count of items in the array
        if ( updateCount ) *countPtr = newCount;
    }
    // all done if value was unchanged
    if ( ! changed ) return true;
    // show it
    log("parms: updating parm \"%s\" "
        "(%s[%"INT32"]) (collnum=%"INT32") from \"%s\" -> \"%s\"",
        parm->m_title,
        parm->m_cgi,
        occNum,
        (int32_t)collnum,
        val1.getBufStart(),
        val2.getBufStart());
    if ( cr ) cr->m_needsSave = true;
    // HACK #2
    if ( base == cr && dst == (char *)&cr->m_importEnabled )
        resetImportLoopFlag();
    //
    // HACK
    //
    // special hack. if spidering re-enabled then reset last spider
    // attempt time to 0 to avoid the "has no more urls to spider"
    // msg followed by the reviving url msg.
    if ( base == cr && dst == (char *)&cr->m_spideringEnabled )
        cr->m_localCrawlInfo.m_lastSpiderAttempt = 0;
    if ( base == &g_conf &&
         dst == (char *)&g_conf.m_spideringEnabled ) {
        for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
            CollectionRec *cx = g_collectiondb.m_recs[i];
            if ( ! cx ) continue;
            cx->m_localCrawlInfo.m_lastSpiderAttempt = 0;
        }
    }
    //
    // if user changed the crawl/process max then reset here so
    // spiders will resume
    //
    if ( base == cr &&
         dst == (char *)&cr->m_maxToCrawl &&
         cr->m_spiderStatus == SP_MAXTOCRAWL ) {
        // reset this for rebuilding of active spider collections
        // so this collection can be in the linked list again
        cr->m_spiderStatus = SP_INPROGRESS;
        // rebuild list of active spider collections then
        g_spiderLoop.m_activeListValid = false;
    }
    if ( base == cr &&
         dst == (char *)&cr->m_maxToProcess &&
         cr->m_spiderStatus == SP_MAXTOPROCESS ) {
        cr->m_spiderStatus = SP_INPROGRESS;
        g_spiderLoop.m_activeListValid = false;
    }
    if ( base == cr &&
         dst == (char *)&cr->m_maxCrawlRounds &&
         cr->m_spiderStatus == SP_MAXROUNDS ) {
        cr->m_spiderStatus = SP_INPROGRESS;
        g_spiderLoop.m_activeListValid = false;
    }
    //
    // END HACK
    //
    // all done
    return true;
}
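// Editor's sketch (not in the original source): how a single parm rec
// is assembled by hand, using the same layout updateParm() just
// consumed. The cgi name "foo" is made up and assumed to name a
// TYPE_SAFEBUF parm on collection #0; numeric parm types carry raw
// binary values instead of text:
//
//    Parm *m = g_parms.getParmFast1 ( "foo" , NULL );
//    key96_t k = makeParmKey ( (collnum_t)0 , m , -1 );
//    char rec [ sizeof(key96_t) + 4 + 16 ];
//    char *val = "hello";
//    int32_t dataSize = gbstrlen(val) + 1; // include the \0
//    gbmemcpy ( rec , &k , sizeof(key96_t) );
//    *(int32_t *)(rec+sizeof(key96_t)) = dataSize;
//    gbmemcpy ( rec+sizeof(key96_t)+4 , val , dataSize );
//    // the WaitEntry is only needed for blocking cmds like delete coll
//    g_parms.updateParm ( rec , NULL );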
bool Parm::printVal ( SafeBuf *sb , collnum_t collnum , int32_t occNum ) {
    CollectionRec *cr = NULL;
    if ( collnum >= 0 ) cr = g_collectiondb.getRec ( collnum );
    // no value if no storage record offset
    //if ( m_off < 0 ) return true;
    char *base;
    if ( m_obj == OBJ_COLL ) base = (char *)cr;
    else base = (char *)&g_conf;
    if ( ! base ) {
        log("parms: no collrec (%"INT32") to print parm",
            (int32_t)collnum);
        g_errno = ENOCOLLREC;
        return true;
    }
    // point to the parm's value in the collrec (or g_conf)
    char *val = (char *)base + m_off;
    if ( isArray() && occNum < 0 ) {
        log("parms: bad occnum for %s",m_title);
        return false;
    }
    // add array index to ptr
    if ( isArray() ) val += m_size * occNum;
    if ( m_type == TYPE_SAFEBUF ) {
        // point to it
        SafeBuf *sb2 = (SafeBuf *)val;
        return sb->safePrintf("%s",sb2->getBufStart());
    }
    // TYPE_SAFEBUF was already handled above, so it is not needed here
    if ( m_type == TYPE_STRING ||
         m_type == TYPE_STRINGBOX ||
         m_type == TYPE_STRINGNONEMPTY )
        return sb->safePrintf("%s",val);
    if ( m_type == TYPE_LONG || m_type == TYPE_LONG_CONST )
        return sb->safePrintf("%"INT32"",*(int32_t *)val);
    if ( m_type == TYPE_DATE )
        return sb->safePrintf("%"INT32"",*(int32_t *)val);
    if ( m_type == TYPE_DATE2 )
        return sb->safePrintf("%"INT32"",*(int32_t *)val);
    if ( m_type == TYPE_FLOAT )
        return sb->safePrintf("%f",*(float *)val);
    if ( m_type == TYPE_LONG_LONG )
        return sb->safePrintf("%"INT64"",*(int64_t *)val);
    if ( m_type == TYPE_CHARPTR ) {
        if ( val ) return sb->safePrintf("%s",val);
        return true;
    }
    if ( m_type == TYPE_BOOL ||
         m_type == TYPE_BOOL2 ||
         m_type == TYPE_CHECKBOX ||
         m_type == TYPE_PRIORITY2 ||
         m_type == TYPE_UFP ||
         m_type == TYPE_CHAR )
        return sb->safePrintf("%hhx",*val);
    if ( m_type == TYPE_CMD )
        return sb->safePrintf("CMD");
    if ( m_type == TYPE_IP )
        // may print 0.0.0.0
        return sb->safePrintf("%s",iptoa(*(int32_t *)val) );
    log("parms: missing parm type!!");
    char *xx=NULL;*xx=0;
    return false;
}
" //"" "
" //"" "" "URL Filters Test" //"" "" "" "" "To test your URL filters simply enter a URL into " "this box and submit it. The URL filter line number " "that it matches will be displayed to the right." "" "" "" "Test URL" "Matching Expression #" "" "" " " "" "%s

\n" , LIGHT_BLUE , DARK_BLUE , testUrl , matchString ); */ sb->safePrintf( "\n" , LIGHT_BLUE ); sb->safePrintf ( "" "" "" "" "" "" "" "" "" "" "" "" /* "" "" "" "" "" "" "" "" */ "" "" "" "" "" "" // MDW: 7/11/2014 take this out until it works. // problem is that the quota table m_localTable // in Spider.cpp gets reset for each firstIp scan, // and we have a.walmart.com and b.walmart.com // with different first ips even though on same // domain. perhaps we should use the domain as the // key to getting the firstip for and subdomain. // but out whole selection algo in spider.cpp is // firstIp based, so it scans all the spiderrequests // from a single firstip to get the winner for that // firstip. // "" // "" "" "" // taken out for the same reason as domainpages // above was taken out. see expanation up there. // "" // "" "" "" "" "" //"" //"" "" "" //"" //"" /* "" "" */ "" "" "" "" "" "" "" "" "" "" //"" //"" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" /* "" "" */ "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" /* "" "" "" "" "" "" */ //"NOTE: Until we get the link info to get the doc " //"quality before calling msg8 in Msg16.cpp, we " //"can not involve doc:quality for purposes of " //"assigning a ruleset, unless banning it." "" "" "" "" //"NOTE: Until we move the language " //"detection up before any call to XmlDoc::set1() " //"in Msg16.cpp, we can not use for purposes of " //"assigning a ruleset, unless banning it." //"" "" "" "" "" /* "" "" "" "" "" "" "" "" */ "" "" "" "" "" "" "" "" "" "" "" "" "
" "" "Supported Expressions" "
defaultMatches every url." "
^http://whateverMatches if the url begins with " "http://whatever" "
$.cssMatches if the url ends with \".css\"." "
foobarMatches if the url CONTAINS foobar." "
tld==uk,jpMatches if url's TLD ends in \"uk\" or \"jp\"." "
doc:quality<40Matches if document quality is " "less than 40. Can be used for assigning to spider " "priority.
doc:quality<40 && tag:ruleset==22Matches if document quality less than 40 and " "belongs to ruleset 22. Only for assigning to " "spider priority.
" "doc:quality<40 && tag:manualban==1Matches if document quality less than 40 and " "is has a value of \"1\" for its \"manualban\" " "tag.
tag:ruleset==33 && doc:quality<40Matches if document quality less than 40 and " "belongs to ruleset 33. Only for assigning to " "spider priority or a banned ruleset.
" "hopcount<4 && iswwwMatches if document has a hop count of 4, and " "is a \"www\" url (or domain-only url).
hopcountAll root urls, those that have only a single " "slash for their path, and no cgi parms, have a " "hop count of 0. Also, all RSS urls, ping " "server urls and site roots (as defined in the " "site rules table) have a hop count of 0. Their " "outlinks have a hop count of 1, and the outlinks " "of those outlinks a hop count of 2, etc." "
sitepagesThe number of pages that are currently indexed " "for the subdomain of the URL. " "Used for doing quotas." "
domainpagesThe number of pages that are currently indexed " // "for the domain of the URL. " // "Used for doing quotas." // "
siteaddsThe number URLs manually added to the " "subdomain of the URL. Used to gauge a subdomain's " "popularity." "
domainaddsThe number URLs manually added to the " // "domain of the URL. Used to gauge a domain's " // "popularity." // "
isrss | !isrssMatches if document is an RSS feed. Will " "only match this rule if the document has been " "successfully spidered before, because it requires " "downloading the document content to see if it " "truly is an RSS feed.." "
isrssext | !isrssextMatches if url ends in .xml .rss or .atom. " "TODO: Or if the link was in an " "alternative link tag." "
!isrssMatches if document is NOT an rss feed." //"
ispermalink | !ispermalinkMatches if document is a permalink. " "When harvesting outlinks we guess if they " "are a permalink by looking at the structure " "of the url.
!ispermalinkMatches if document is NOT a permalink." //"
outlink | !outlink" "This is true if url being added to spiderdb " "is an outlink from the page being spidered. " "Otherwise, the url being added to spiderdb " "directly represents the page being spidered. It " "is often VERY useful to partition the Spiderdb " "records based on this criteria." "
isnewoutlink | !isnewoutlink" "" "This is true since the outlink was not there " "the last time we spidered the page we harvested " "it from." "
hasreply | !hasreply" "This is true if we have tried to spider " "this url, even if we got an error while trying." "
isnew | !isnew" "This is the opposite of hasreply above. A url " "is new if it has no spider reply, including " "error replies. So once a url has been attempted to " "be spidered then this will be false even if there " "was any kind of error." "
lastspidertime >= " "{roundstart}" "This is true if the url's last spidered time " "indicates it was spidered already for this " "current round of spidering. When no more urls " "are available for spidering, then gigablast " "automatically sets {roundstart} to the current " "time so all the urls can be spidered again. This " "is how you do round-based spidering. " "You have to use the respider frequency as well " "to adjust how often you want things respidered." "
urlage" "This is the time, in seconds, since a url was first " "added to spiderdb to be spidered. This is " "its discovery date. " "Can use <, >, <=, >=, ==, != comparison operators." "
!newoutlinkMatches if document is NOT a new outlink." //"
age" "How old is the document in seconds. " "The age is based on the publication date of " "the document, which could also be the " "time that the document was last significantly " "modified. If this date is unknown then the age " "will be -1 and only match the expression " "age==-1. " "When harvesting links, we guess the publication " "date of the oulink by detecting dates contained " "in the url itself, which is popular among some " "forms of permalinks. This allows us to put " "older permalinks into a slower spider queue." "
spiderwaited < 3600" "spiderwaited is how many seconds have elapsed " "since the last time " "we tried to spider/download the url. " "The constaint containing spiderwaited will " "fail to be matched if the url has never been " "attempted to be spidered/downloaded before. Therefore, " "it will only ever match urls that have a spider reply " "of some sort, so there is no need to add an additional " "hasreply-based constraint." "
" "" "insitelist | !insitelist" "" "" "This is true if the url matches a pattern in " "the list of sites on the " "site list page. That site list is useful for " "adding a large number of sites that can not be " "accommodated by the url filters table. Plus " "it is higher performance and easier to use, but " "lacks the url filter table's " "fine level of control." "
" "" "isaddurl | !isaddurl" "" "" "This is true if the url was added from the add " "url interface or API." //"This replaces the add url priority " //"parm." "
isinjected | !isinjected" "This is true if the url was directly " "injected from the " "inject page or API." "
isreindex | !isreindex" "This is true if the url was added from the " "query reindex " "interface. The request does not contain " "a url, but only a docid, that way we can add " "millions of search results very quickly without " "having to lookup each of their urls. You should " "definitely have this if you use the reindexing " "feature. " "You can set max spiders to 0 " "for non " "isreindex requests while you reindex or delete " "the results of a query for extra speed." "
ismanualadd | !ismanualadd" "This is true if the url was added manually. " "Which means it matches isaddurl, isinjected, " " or isreindex. as opposed to only " "being discovered from the spider. " "
inpingserver | !inpingserver" "" "This is true if the url has an inlink from " "a recognized ping server. Ping server urls are " "hard-coded in Url.cpp. " "pingserver urls are assigned a hop count of 0" "" "
isparentrss | !isparentrss" "If a parent of the URL was an RSS page " "then this will be matched." "
isparentsitemap | " "!isparentsitemap" "If a parent of the URL was a sitemap.xml page " "then this will be matched." "
parentisnew | !parentisnew" "Parent providing this outlink is not currently " "in the index but is trying to be added right now. " "This is a special expression in that " "it only applies to assigning spider priorities " "to outlinks we are harvesting on a page." "
isindexed | !isindexed" "This url matches this if in the index already. " "
errorcount==1" "The number of times the url has failed to " "be indexed. 1 means just the last time, two means " "the last two times. etc. Any kind of error parsing " "the document (bad utf8, bad charset, etc.) " "or any HTTP status error, like 404 or " "505 is included in this count, in addition to " "\"temporary\" errors like DNS timeouts." "
errorcode==32880" "If the last time it was spidered it had this " "numeric error code. See the error codes in " "Errno.cpp. In this particular example 32880 is " "for EBADURL." "
hastmperror" "This is true if the last spider attempt resulted " "in an error like EDNSTIMEDOUT or a similar error, " "usually indicative of a temporary internet " "failure, or local resource failure, like out of " "memory, and should be retried soon. " "Currently: " "dns timed out, " "tcp timed out, " "dns dead, " "network unreachable, " "host unreachable, " "diffbot internal error, " "out of memory." "
percentchangedperday<=5" "Looks at how much a url's page content has changed " "between the last two times it was spidered, and " "divides that percentage by the number of days. " "So if a URL's last two downloads were 10 days " "apart and its page content changed 30%% then " "the percentchangedperday will be 3. " "Can use <, >, <=, >=, ==, != comparison operators. " "
sitenuminlinks>20" "How many inlinks does the URL's site have? " "We only count non-spammy inlinks, and at most only " "one inlink per IP address C-Class is counted " "so that a webmaster who owns an entire C-Class " "of IP addresses will only have his inlinks counted " "once." "Can use <, >, <=, >=, ==, != comparison operators. " "
numinlinks>20" "How many inlinks does the URL itself have? " "We only count one link per unique C-Class IP " "address " "so that a webmaster who owns an entire C-Class " "of IP addresses will only have her inlinks counted " "once." "Can use <, >, <=, >=, ==, != comparison operators. " "This is useful for spidering popular URLs quickly." "
httpstatus==404" "For matching the URL based on the http status " "of its last download. Does not apply to URLs " "that have not yet been successfully downloaded." "Can use <, >, <=, >=, ==, != comparison operators. " "
priority==30" "If the current priority of the url is 30, then " "it will match this expression. Does not apply " "to outlinks, of course." "
parentpriority==30" "This is a special expression in that " "it only applies to assigning spider priorities " "to outlinks we are harvesting on a page. " "Matches if the url being added to spider queue " "is from a parent url in priority queue 30. " "The parent's priority queue is the one it got " "moved into while being spidered. So if it was " "in priority 20, but ended up in 25, then 25 will " "be used when scanning the URL Filters table for " "each of its outlinks. Only applies " "to the FIRST time the url is added to spiderdb. " "Use parentpriority==-3 to indicate the " "parent was FILTERED and -2 to indicate " "the parent was BANNED. A parentpriority of " "-1" " means that the urls is not a link being added to " "spiderdb but rather a url being spidered." "
inlink==..." "If the url has an inlinker which contains the " "given substring, then this rule is matched. " "We use this like inlink=www.weblogs.com/" "int16_tChanges.xml to detect if a page is in " "the ping server or not, and if it is, then we " "assign it to a slower-spidering queue, because " "we can reply on the ping server for updates. Saves " "us from having to spider all the blogspot.com " "subdomains a couple times a day each." "
tld!=com,org,edu"// && " //"doc:quality<70" "Matches if the " "url's TLD does NOT end in \"com\", \"org\" or " "\"edu\". " "
lang==zh_cn,de" "Matches if " "the url's content is in the language \"zh_cn\" or " "\"de\". See table below for supported language " "abbreviations. Used to only keep certain languages " "in the index. This is hacky because the language " "may not be known at spider time, so Gigablast " "will check after downloading the document to " "see if the language spider priority is " "DELETE thereby discarding it.
lang!=xx,en,de" "Matches if " "the url's content is NOT in the language \"xx\" " "(unknown), \"en\" or \"de\". " "See table below for supported language " "abbreviations.
parentlang==zh_cn,zh_tw,xx" "Matches if " "the url's referring parent url is primarily in " "this language. Useful for prioritizing spidering " "pages of a certain language." "See table below for supported language " "abbreviations." "
link:gigablastMatches if the document links to gigablast." "
searchbox:gigablastMatches if the document has a submit form " "to gigablast." "
site:dmozMatches if the document is directly or " "indirectly in the DMOZ directory." "
tag:spam>XMatches if the document's tagdb record " "has a score greater than X for the sitetype, " "'spam' in this case. " "Can use <, >, <=, >=, ==, != comparison operators. " "Other sitetypes include: " "..." "
iswww | !iswwwMatches if the url's hostname is www or domain " "only. For example: www.xyz.com would match, " "and so would abc.com, but " "foo.somesite.com would NOT match." "
isroot | !isrootMatches if the URL is a root URL. Like if " "its path is just '/'. Example: http://www.abc.com " "is a root ur but http://www.abc.com/foo is not. " "
isonsamedomain | !isonsamedomain" "This is true if the url is from the same " "DOMAIN as the page from which it was " "harvested." //"Only effective for links being added from a page " //"being spidered, because this information is " //"not preserved in the titleRec." "
" "isonsamesubdomain | !isonsamesubdomain" "" "This is true if the url is from the same " "SUBDOMAIN as the page from which it was " "harvested." //"Only effective for links being added from a page " //"being spidered, because this information is " //"not preserved in the titleRec." "
ismedia | !ismedia" "Does the url have a media or css related " "extension. Like gif, jpg, mpeg, css, etc.? " "
tag:tagname" "This is true if the url is tagged with this " "tagname in the site list. Read about tags " "on the "//#examples>" "site list " "page." "


\n", TABLE_STYLE ); // show the languages you can use sb->safePrintf ( "" "", TABLE_STYLE ); for ( int32_t i = 0 ; i < 256 ; i++ ) { char *lang1 = getLanguageAbbr ( i ); char *lang2 = getLanguageString ( i ); if ( ! lang1 ) continue; sb->safePrintf("" "\n", lang1,lang2); } // wrap it up sb->safePrintf("
" "" "Supported Language Abbreviations " "for lang== Filter" "
%s%s


"); return true; } // . copy/clone parms from one collrec to another // . returns false and sets g_errno on error // . if doing this after creating a new collection on host #0 we have to call // syncParmsWithHost0() to get all the shards in sync. bool Parms::cloneCollRec ( char *dstCR , char *srcCR ) { // now set THIS based on the parameters in the xml file for ( int32_t i = 0 ; i < m_numParms ; i++ ) { // get it Parm *m = &m_parms[i]; if ( m->m_obj != OBJ_COLL ) continue; //log(LOG_DEBUG, "Parms: %s: parm: %s", filename, m->m_xml); // . there are 2 object types, coll recs and g_conf, aka // OBJ_COLL and OBJ_CONF. // skip comments and command if ( !(m->m_flags & PF_CLONE) ) continue; // get parm data ptr char *src = srcCR + m->m_off; char *dst = dstCR + m->m_off; // if not an array use this if ( ! m->isArray() ) { if ( m->m_type == TYPE_SAFEBUF ) { SafeBuf *a = (SafeBuf *)src; SafeBuf *b = (SafeBuf *)dst; b->reset(); b->safeMemcpy ( a ); b->nullTerm(); } else { // this should work for most types gbmemcpy ( dst , src , m->m_size ); } continue; } // // arrays only below here // // for arrays only int32_t *srcNum = (int32_t *)(src-4); int32_t *dstNum = (int32_t *)(dst-4); // array can have multiple values for ( int32_t j = 0 ; j < *srcNum ; j++ ) { if ( m->m_type == TYPE_SAFEBUF ) { SafeBuf *a = (SafeBuf *)src; SafeBuf *b = (SafeBuf *)dst; b->reset(); b->safeMemcpy ( a ); b->nullTerm(); } else { // this should work for most types gbmemcpy ( dst , src , m->m_size ); } src += m->m_size; dst += m->m_size; } // update # elements in array *dstNum = *srcNum; } return true; }