www.pudn.com > heritrix-1.14.0-src.rar > heritrix.properties


##############################################################################
# HERITRIX PROPERTIES
##############################################################################

# Properties whose names start with 'heritrix.', 'org.archive.', or 'system.'
# are copied into System.properties on startup, so they are available via
# System.getProperties. (For 'system.' properties, that prefix is stripped.)
# See Heritrix.loadProperties().

# Version is filled in by the maven.xml pregoal. It copies here the project
# currentVersion property.
heritrix.version = @VERSION@

# Location of the heritrix jobs directory.
heritrix.jobsdir = jobs

# Default commandline startup values.
# The values below are used when the corresponding option is not specified
# on the command line.
heritrix.cmdline.admin = 
heritrix.cmdline.port = 8080
heritrix.cmdline.run = false
heritrix.cmdline.nowui = false
heritrix.cmdline.order =
heritrix.cmdline.jmxserver = false
heritrix.cmdline.jmxserver.port = 8081

##############################################################################
# LOGGING
##############################################################################

# Basic logging configuration: a single console handler accepting all levels.
# Note: after startup Heritrix adds a special logging handler, SinkHandler,
# which keeps every severe and warning message around for display by the
# Heritrix alerting system.  It used to be initialized here together with the
# ConsoleHandler, but it was not being found when Heritrix was packaged as a
# webapp deployed in Tomcat.
handlers = java.util.logging.ConsoleHandler
java.util.logging.ConsoleHandler.level = ALL
java.util.logging.ConsoleHandler.formatter = org.archive.util.OneLineSimpleLogger

# Global default logging level: warnings or higher only.
.level = WARNING

# Enable frontier INFO logging
# org.archive.crawler.frontier.BdbFrontier.level = FINER
# org.archive.crawler.frontier.BdbMultipleWorkQueues.level = INFO
# org.archive.crawler.frontier.AbstractFrontier.level = INFO
# org.archive.crawler.frontier.WorkQueueFrontier.level = INFO

# Levels for the standard crawl logs.  Currently these appear to be
# necessary (?) for the standard logs to work.
crawl.level = INFO
runtime-errors.level = INFO
local-errors.level = INFO
uri-errors.level = INFO
progress-statistics.level = INFO
recover.level = INFO

# Set the selftest console logger level.
org.archive.crawler.admin.SelftestCrawlJobHandler.level = INFO

# Log CrawlJob at INFO so that its registration of jobs with the MBeanServer
# is visible.
org.archive.crawler.admin.CrawlJob.level = INFO

# Uncomment the following lines to watch authentications running.
# org.archive.crawler.fetcher.FetchHTTP.level = FINE
# org.archive.crawler.prefetch.PreconditionEnforcer.level = FINE

# HttpClient is too chatty; only report severe problems.
# For more on httpclient logging,
# see http://jakarta.apache.org/commons/httpclient/logging.html
org.apache.commons.httpclient.level = SEVERE

# If you need verbose console logging of HttpClient traffic, uncomment the
# following line: 
# httpclient.wire.level = FINE

# Enable ARCWriter.level line below to see logging of the opening and closing
# of arc files in console output (Console goes into heritrix_out.log). If you'd
# rather have console output, including the opening and closing of ARCs, go to
# a file, enable the FileHandler logger -- see the commented out 'handlers'
# line above -- and enable the lines below related to FileHandler (Configure to
# your preference -- the '%h' in the below means value of java.home system
# property, usually your home directory).  There does not seem to be a way
# other than in code to configure only ARCWriter writing to the FileHandler.
# Even if you do do code changes, its awkward making the log show in each
# individual jobs log directory. Note, the closing of arc files on shutdown is
# not logged.
# org.archive.io.arc.ARCWriter.level = INFO

# Enable to watch authentication progress amongst other details on http fetch.
# org.archive.crawler.fetcher.FetchHTTP.level = FINE

# Enable logging of growth of the already-seen memory structure; this growth
# is probably the main culprit for OOMEs.  Log goes into heritrix_out.log.
org.archive.util.MemLongFPSet.level = INFO
# org.archive.crawler.url.Canonicalizer.level = INFO

# Enable logging of uris rejected by scope by setting level to INFO.
# org.archive.crawler.postprocessor.LinksScoper.level = INFO

# The values below are used as defaults by classes that set up their own
# FileHandler. The pattern value is used as a suffix.  Note that this
# configuration limits log file size (to 500000 bytes) and, because count
# is 1, old log data will be lost.
java.util.logging.FileHandler.level = ALL
java.util.logging.FileHandler.pattern = %u.log
java.util.logging.FileHandler.formatter = org.archive.util.OneLineSimpleLogger
java.util.logging.FileHandler.limit = 500000
java.util.logging.FileHandler.count = 1

# org.archive.crawler.admin.StatisticsTracker.level = INFO

# Enable info logging so that the note on checkpoint completion shows up in
# heritrix_out.log.
org.archive.crawler.framework.CrawlController.level = INFO

# AdaptiveRevist module
# org.archive.crawler.frontier.AdaptiveRevisitFrontier.level = FINE
# org.archive.crawler.frontier.AdaptiveRevisitHostQueue.level = FINE
# org.archive.crawler.frontier.AdaptiveRevisitQueueList.level = FINE
# org.archive.crawler.processor.ChangeEvaluator.level = FINE
# org.archive.crawler.processor.WaitEvaluator.level = FINE
# org.archive.crawler.processor.HTTPContentDigest.level = FINE

# ServerCache creation of crawl host (Make it FINER if you want to see 
# crawl server creation also).
# org.archive.crawler.datamodel.ServerCache.level = FINE

# Enable to see average alreadyseen lookup times.
# org.archive.crawler.util.BdbUriUniqFilter.level = INFO

# Enable to see duplicated-detection stats on each merge, 
# if using an FPMergeUriUniqFilter
# org.archive.crawler.util.FPMergeUriUniqFilter.level = INFO

# Enable to see report of duplicates encountered every 50K unique adds
# org.archive.crawler.util.SetBasedUriUniqFilter.level = FINE
# Used to set a log of all URIs presented to filter, for 
# later comparison profiling
# org.archive.crawler.util.SetBasedUriUniqFilter.profileLogFile = uriUniq.log

# Leave the line below enabled so that launch info messages show on the
# console.
org.archive.crawler.Heritrix.level = INFO

# Leave the RecoveryJournal level at INFO to see recovery progress.
org.archive.crawler.frontier.RecoveryJournal.level = INFO


#############################################################################
# FRONTIER
#############################################################################

# List here every queue assignment policy that should be shown as a
# queue-assignment-policy choice in Frontiers derived from AbstractFrontier
# (e.g. BdbFrontier).  The value spans several lines: each trailing
# backslash continues it onto the next line, so the backslash must be the
# last character on its line.
org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy = \
    org.archive.crawler.frontier.HostnameQueueAssignmentPolicy \
    org.archive.crawler.frontier.IPQueueAssignmentPolicy \
    org.archive.crawler.frontier.BucketQueueAssignmentPolicy \
    org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy \
    org.archive.crawler.frontier.TopmostAssignedSurtQueueAssignmentPolicy
# Log BdbFrontier at INFO.
org.archive.crawler.frontier.BdbFrontier.level = INFO

##############################################################################
# UURI
##############################################################################
# Any scheme not listed below will generate an UnsupportedUriScheme
# exception.  Make the list empty to support all schemes.
org.archive.net.UURIFactory.schemes = http, https, dns, invalid
# 'invalid' is a fake scheme used as a last-ditch fallback when a UURI
# instance is expected and raising a URIException would be problematic (as
# in deserializing what was a valid UURI at serialization time). It may be
# removed, but doing so could reduce crawl robustness in the face of
# potential future bugs.

# The following schemes are marked as intentionally ignored in the
# exception -- and thus not logged.
# NOTE: the continuation backslash must be the very last character on its
# line.  A backslash followed by a space is an escaped space, not a line
# continuation; the next line would then be parsed as a separate property
# (key 'file,') and file, rtsp and about would be missing from this list.
org.archive.net.UURIFactory.ignored-schemes = mailto, clsid, res, ftp, \
    file, rtsp, about

##############################################################################
# CrawlURI
##############################################################################
# Maximum links per page.
# Default is 6000. Links beyond the maximum are not scheduled. An annotation
# in crawl.log -- dol:N -- is added where N is number of links discarded
# (dol == Discarded OutLinks).
# org.archive.crawler.datamodel.CrawlURI.maxOutLinks = 6000

##############################################################################
# ExtractorHTML
##############################################################################
# Configuration for html extractor regex. 
# Without maximums, regexes can return matches that span megabytes
# in strange html with corresponding megabyte String allocations (See 
# https://sourceforge.net/tracker/?func=detail&atid=539099&aid=1220714&group_id=73833
# Set the maximum element name length.
# org.archive.crawler.extractor.ExtractorHTML.maxElementNameLength = 1024
# Set the maximum attribute name length.
# org.archive.crawler.extractor.ExtractorHTML.maxAttributeNameLength = 1024
# Set the maximum attribute value length.
# org.archive.crawler.extractor.ExtractorHTML.maxAttributeValueLength = 16384

##############################################################################
# Checkpointing
##############################################################################
# Make checkpointing activity show in the logs.
org.archive.crawler.framework.Checkpointer.level = INFO
org.archive.crawler.selftest.CheckpointSelfTest.level = INFO
# Uncomment to run a background thread that checkpoints periodically; the
# value is the number of hours between checkpoints.
# org.archive.crawler.framework.Checkpointer.period = 4

##############################################################################
# Jetty / WebUI
##############################################################################
# Raise Jetty's default form-length limit to 50MB (52428800 bytes).
# The 'system.' prefix is stripped when this property is copied into
# System.properties (see the note at the top of this file).
system.org.mortbay.http.HttpRequest.maxFormContentSize = 52428800