# www.pudn.com > heritrixProject.rar > heritrix.properties
##############################################################################
# H E R I T R I X P R O P E R T I E S
##############################################################################
# Properties with 'heritrix.' or 'org.archive.crawler.' prefix get loaded
# into System.properties on startup so available via System.getProperties.
# Version is filled in by the maven.xml pregoal. It copies here the project
# currentVersion property.
heritrix.version = 1.10.0
# Location of the Heritrix jobs directory.
heritrix.jobsdir = jobs
# Default command-line startup values.
# The values below are used when the corresponding option is not specified on
# the command line.
# NOTE(review): well-known default credentials stored in plain text
# (presumably login:password for the admin interface -- confirm). Override
# them for any deployment that other people can reach.
heritrix.cmdline.admin = admin:letmein
heritrix.cmdline.port = 8080
heritrix.cmdline.run = false
heritrix.cmdline.nowui = false
# Intentionally empty: no default value.
heritrix.cmdline.order =
heritrix.cmdline.jmxserver = false
heritrix.cmdline.jmxserver.port = 8081
##############################################################################
# L O G G I N G #
##############################################################################
# Basic logging setup; to console, all levels.
# Note, after startup Heritrix adds a special logging handler called SinkHandler
# that keeps around all instances of severe and warning messages for display by
# the Heritrix alerting system. We used to initialize it here with the
# ConsoleHandler but it wasn't being found when Heritrix was packaged as a webapp
# deployed in Tomcat.
# Handlers installed on the root logger at startup: console only.
handlers = java.util.logging.ConsoleHandler
# The console handler itself lets every level through; what is actually
# printed is decided by the logger levels configured below.
java.util.logging.ConsoleHandler.level = ALL
java.util.logging.ConsoleHandler.formatter= org.archive.util.OneLineSimpleLogger
# Default global logging level: only warnings or higher. The per-logger
# '<name>.level' entries elsewhere in this file override it for the named
# loggers.
.level= WARNING
# Enable frontier INFO logging
# org.archive.crawler.frontier.BdbFrontier.level = FINER
# org.archive.crawler.frontier.BdbMultipleWorkQueues.level = INFO
# org.archive.crawler.frontier.AbstractFrontier.level = INFO
# org.archive.crawler.frontier.WorkQueueFrontier.level = INFO
# Currently necessary (?) for standard logs to work.
# Raises the loggers named crawl, runtime-errors, local-errors, uri-errors,
# progress-statistics and recover to INFO, above the global WARNING default.
# NOTE(review): presumably these are the loggers behind the per-job log files
# of the same names -- confirm.
crawl.level= INFO
runtime-errors.level= INFO
local-errors.level= INFO
uri-errors.level= INFO
progress-statistics.level= INFO
recover.level= INFO
# Set the selftest console logger level.
org.archive.crawler.admin.SelftestCrawlJobHandler.level = INFO
# Enable INFO level on CrawlJob so you can see it register jobs with the
# MBean server.
org.archive.crawler.admin.CrawlJob.level = INFO
# Enable the following lines to watch authentications running.
# org.archive.crawler.fetcher.FetchHTTP.level = FINE
# org.archive.crawler.prefetch.PreconditionEnforcer.level = FINE
# HttpClient is too chatty; we only want to hear about severe problems.
# For more on httpclient logging,
# see http://jakarta.apache.org/commons/httpclient/logging.html
org.apache.commons.httpclient.level = SEVERE
# If you need verbose console logging of HttpClient traffic, uncomment the
# following line:
# httpclient.wire.level = FINE
# Enable the ARCWriter.level line below to see logging of the opening and
# closing of arc files in console output (console goes into heritrix_out.log).
# If you'd rather have console output, including the opening and closing of
# ARCs, go to a file, enable the FileHandler logger -- add
# java.util.logging.FileHandler to the 'handlers' line above -- and enable the
# lines below related to FileHandler (configure to your preference -- the '%h'
# in a FileHandler pattern means the value of the user.home system property,
# usually your home directory). There does not seem to be a way other than in
# code to configure only ARCWriter writing to the FileHandler. Even if you do
# make code changes, it's awkward making the log show in each individual job's
# log directory. Note, the closing of arc files on shutdown is not logged.
# org.archive.io.arc.ARCWriter.level = INFO
# Enable to watch authentication progress amongst other details on http fetch.
# org.archive.crawler.fetcher.FetchHTTP.level = FINE
# Enable logging of each time the in-memory already-seen set grows; this
# growth is probably the main culprit for OutOfMemoryErrors (OOMEs). The log
# goes into heritrix_out.log.
org.archive.util.MemLongFPSet.level = INFO
# org.archive.crawler.url.Canonicalizer.level = INFO
# Enable logging of uris rejected by scope by setting level to INFO.
# org.archive.crawler.postprocessor.LinksScoper.level = INFO
# The values below are used as defaults by classes that set up their own
# FileHandler. The pattern value is used as a suffix. Note, the configuration
# below limits log file size (to 500000 bytes) and, because count is 1, old
# log data will be lost once the limit is reached.
# In java.util.logging file patterns '%u' is a unique number used to resolve
# file name conflicts.
java.util.logging.FileHandler.level = ALL
java.util.logging.FileHandler.pattern = %u.log
java.util.logging.FileHandler.formatter = org.archive.util.OneLineSimpleLogger
# Approximate maximum size of one log file, in bytes.
java.util.logging.FileHandler.limit = 500000
# Number of log files to cycle through; 1 means no history is kept.
java.util.logging.FileHandler.count = 1
# org.archive.crawler.admin.StatisticsTracker.level = INFO
# Enable INFO logging so the note on checkpoint completion shows up in
# heritrix_out.log.
org.archive.crawler.framework.CrawlController.level = INFO
# AdaptiveRevisit module
# org.archive.crawler.frontier.AdaptiveRevisitFrontier.level = FINE
# org.archive.crawler.frontier.AdaptiveRevisitHostQueue.level = FINE
# org.archive.crawler.frontier.AdaptiveRevisitQueueList.level = FINE
# org.archive.crawler.processor.ChangeEvaluator.level = FINE
# org.archive.crawler.processor.WaitEvaluator.level = FINE
# org.archive.crawler.processor.HTTPContentDigest.level = FINE
# ServerCache creation of crawl host (Make it FINER if you want to see
# crawl server creation also).
# org.archive.crawler.datamodel.ServerCache.level = FINE
# Enable to see average alreadyseen lookup times.
# org.archive.crawler.util.BdbUriUniqFilter.level = INFO
# Enable to see duplicated-detection stats on each merge,
# if using an FPMergeUriUniqFilter
# org.archive.crawler.util.FPMergeUriUniqFilter.level = INFO
# Enable to see report of duplicates encountered every 50K unique adds
# org.archive.crawler.util.SetBasedUriUniqFilter.level = FINE
# Used to set a log of all URIs presented to filter, for
# later comparison profiling
# org.archive.crawler.util.SetBasedUriUniqFilter.profileLogFile = uriUniq.log
# Leave the line below enabled so that launch info messages show on the
# console.
org.archive.crawler.Heritrix.level = INFO
# Leave the RecoveryJournal level at INFO to see recovery progress.
org.archive.crawler.frontier.RecoveryJournal.level = INFO
#############################################################################
# F R O N T I E R #
#############################################################################
# List here all queue assignment policies you want offered as a
# queue-assignment-policy choice in AbstractFrontier-derived Frontiers
# (e.g. BdbFrontier).
# The value is one logical line: a whitespace-separated list of class names,
# continued with a trailing backslash. Do not put comments or blank lines
# between the continued lines, and keep the last entry free of a trailing
# backslash.
org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy = \
org.archive.crawler.frontier.HostnameQueueAssignmentPolicy \
org.archive.crawler.frontier.IPQueueAssignmentPolicy \
org.archive.crawler.frontier.BucketQueueAssignmentPolicy \
org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy
# Logging level for the BdbFrontier logger (a logging setting kept here with
# the other frontier configuration).
org.archive.crawler.frontier.BdbFrontier.level = INFO
##############################################################################
# U U R I #
##############################################################################
# Any scheme not listed below will generate an UnsupportedUriScheme
# exception. Make the list empty to support all schemes.
# The value is a comma-separated list.
org.archive.net.UURIFactory.schemes = http, https, dns, invalid
# 'invalid' is a fake scheme used as a last-ditch fallback when a UURI
# instance is expected and raising a URIException would be problematic (as
# in deserializing what was a valid UURI at serialization time). It may be
# removed, but doing so could reduce crawl robustness in the face of
# potential future bugs.
#
# The following schemes will be marked as intentionally ignored in the
# exception -- and thus not logged. The comma-separated list is continued
# onto a second line with a trailing backslash.
org.archive.net.UURIFactory.ignored-schemes = mailto, clsid, res, ftp, \
file, rtsp, about
##############################################################################
# C r a w l U R I #
##############################################################################
# Maximum links per page.
# Default is 6000. Links beyond the maximum are not scheduled. An annotation
# in crawl.log -- dol:N -- is added where N is number of links discarded
# (dol == Discarded OutLinks).
# org.archive.crawler.datamodel.CrawlURI.maxOutLinks = 6000
##############################################################################
# E x t r a c t o r H T M L #
##############################################################################
# Configuration for html extractor regex.
# Without maximums, regexes can return matches that span megabytes
# in strange html, with corresponding megabyte String allocations. See
# https://sourceforge.net/tracker/?func=detail&atid=539099&aid=1220714&group_id=73833
# Set the maximum element name length.
# org.archive.crawler.extractor.ExtractorHTML.maxElementNameLength = 1024
# Set the maximum attribute name length.
# org.archive.crawler.extractor.ExtractorHTML.maxAttributeNameLength = 1024
# Set the maximum attribute value length.
# org.archive.crawler.extractor.ExtractorHTML.maxAttributeValueLength = 16384
##############################################################################
# C h e c k p o i n t i n g
##############################################################################
# Have checkpointing activity show in the logs: raise the Checkpointer and
# CheckpointSelfTest loggers to INFO, above the global WARNING default.
org.archive.crawler.framework.Checkpointer.level = INFO
org.archive.crawler.selftest.CheckpointSelfTest.level = INFO
# Uncomment to run background thread that will checkpoint on the appointed
# period (in hours between checkpoints).
# org.archive.crawler.framework.Checkpointer.period = 4