# This Properties map is specified in the Java 'property list' text format # http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29 ### ### some of these overrides are actually just the default values, so they can be skipped ### ## Q: can overrides like 'fetchDns.enabled=false' be used to disable the beans? metadata.jobName=crawlrss_cxml metadata.description=Default Profile metadata.operator=Admin metadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/3.3.0 +@OPERATOR_CONTACT_URL@) ## Edit the two following lines to match your setup. metadata.operatorContactUrl=http://netarkivet.dk/webcrawler/ metadata.operatorFrom=info@netarkivet.dk loggerModule.path=logs crawlLimiter.maxBytesDownload=0 crawlLimiter.maxDocumentsDownload=0 ## maxTimeSeconds is inserted by NetarchiveSuite (delete this line if the behaviour is unwanted) crawlLimiter.maxTimeSeconds=%{MAX_TIME_SECONDS_PLACEHOLDER} crawlController.maxToeThreads=50 crawlController.recorderOutBufferBytes=4096 crawlController.recorderInBufferBytes=65536 crawlController.pauseAtStart=false crawlController.scratchDir=scratch ## org.archive.bdb.BdbModule overrides bdb.dir=state bdb.cachePercent=40 ## seeds properties seeds.sourceTagSeeds=false ## Politeness overrides disposition.delayFactor=1.0 disposition.maxDelayMs=1000 disposition.minDelayMs=300 disposition.maxPerHostBandwidthUsageKbSec=500 ## preparer.preferenceEmbedHops=1 ## preparer.preferenceDepthHops=-1 frontier.maxRetries=3 frontier.retryDelaySeconds=300 frontier.recoveryLogEnabled=false frontier.balanceReplenishAmount=3000 frontier.errorPenaltyAmount=100 ## Can be used instead of the QuotaEnforcer module. 
In this case the following line should look ## like: frontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER} ## instead of: frontier.queueTotalBudget= frontier.queueTotalBudget=%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER} frontier.snoozeLongMs=300000 preselector.enabled=true preselector.logToFile=false preselector.recheckScope=true preselector.blockAll=false preconditions.enabled=true preconditions.ipValidityDurationSeconds=21600 preconditions.robotsValidityDurationSeconds=86400 preconditions.calculateRobotsOnly=false fetchDns.enabled=true fetchDns.acceptNonDnsResolves=false fetchDns.digestContent=true fetchDns.digestAlgorithm=sha1 fetchHttp.enabled=true fetchHttp.timeoutSeconds=1200 fetchHttp.soTimeoutMs=20000 fetchHttp.maxFetchKBSec=0 fetchHttp.maxLengthBytes=0 fetchHttp.ignoreCookies=false fetchHttp.sslTrustLevel=OPEN fetchHttp.defaultEncoding=ISO-8859-1 fetchHttp.digestContent=true fetchHttp.digestAlgorithm=sha1 fetchHttp.sendIfModifiedSince=true fetchHttp.sendIfNoneMatch=true fetchHttp.sendConnectionClose=true fetchHttp.sendReferer=true fetchHttp.sendRange=false extractorHttp.enabled=true extractorHtml.enabled=true extractorHtml.extractJavascript=true extractorHtml.treatFramesAsEmbedLinks=false extractorHtml.ignoreFormActionUrls=true extractorHtml.extractValueAttributes=false extractorHtml.ignoreUnexpectedHtml=true extractorCss.enabled=true extractorJs.enabled=true extractorSwf.enabled=true candidates.seedsRedirectNewSeeds=false statisticsTracker.intervalSeconds=20 warcWriter.compress=true %{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER} %{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER} https://www.dr.dk/nyheder/ http://www.dr.dk/nyheder/allenyheder/indland http://www.dr.dk/nyheder/allenyheder/udland http://www.dr.dk/nyheder/allenyheder/penge