Documentation
¶
Index ¶
- Constants
- Variables
- func CrawlProcess(taskChannel chan struct{}, e *Engine, task *Task)
- func ParseHTML(parser HTMLParser, ctx *Context) error
- func RandomIntRangeWithStringSeed(min int, max int, seedString string) int
- func ReadListFile(listFilePath string) ([]string, error)
- func RequestWithURL(task *Task, middlewares ...Middleware) (io.Reader, error)
- type ChannelPipeline
- type ChannelPipelineToken
- type Context
- type CookieMiddleware
- type CookieMiddlewareOption
- type CookieStore
- type DefaultCookieStore
- type DefaultItem
- func (i *DefaultItem) GetFloat64(key string) (float64, error)
- func (i *DefaultItem) GetInt(key string) (int, error)
- func (i *DefaultItem) GetString(key string) (string, error)
- func (i DefaultItem) GetToken() string
- func (i *DefaultItem) GetValue(key string) (interface{}, error)
- func (i *DefaultItem) SetValue(key string, value interface{})
- type DelayMiddleware
- type Engine
- func (e *Engine) AddHTMLParser(parsers ...HTMLParser)
- func (e *Engine) AddPipelines(pipelines ...Pipeline)
- func (e *Engine) AddPlugins(plugins ...Plugin)
- func (e *Engine) AddPostProcess(postprocessList ...PostProcess)
- func (e *Engine) AddTasks(tasks ...*Task)
- func (e *Engine) AddURLs(urls ...string)
- func (e *Engine) Run(wg *sync.WaitGroup)
- func (e *Engine) RunAndWait()
- func (e *Engine) UseMiddleware(middlewares ...Middleware)
- func (e *Engine) UseTaskPool(taskPool TaskPool)
- type EngineOption
- type GlobalStore
- type GlobalStorePipeline
- type HTMLParser
- type ImageDownloadItem
- type ImageDownloadPipeline
- type MemoryGlobalStore
- type Middleware
- type OutputCSVPostProcess
- type OutputCSVPostProcessOption
- type OutputJsonPostProcess
- type Pipeline
- type Plugin
- type PostProcess
- type ProxyMiddleware
- type ProxyMiddlewareOption
- type RequestPool
- func (p *RequestPool) AddTasks(tasks ...*Task)
- func (p *RequestPool) AddURLs(urls ...string)
- func (p *RequestPool) Close()
- func (p *RequestPool) GetCompleteCount() (int, error)
- func (p *RequestPool) GetDoneChan() chan struct{}
- func (p *RequestPool) GetOneTask(e *Engine) <-chan *Task
- func (p *RequestPool) GetTotal() (int, error)
- func (p *RequestPool) GetUnRequestCount() (int, error)
- func (p *RequestPool) GetUnRequestedTask() (target *Task)
- func (p *RequestPool) OnTaskDone(task *Task)
- func (p *RequestPool) SetPrevent(isPrevent bool)
- type RequestPoolOption
- type StatusOutputPlugin
- type Task
- type TaskPool
- type UserAgentMiddleware
- type UserAgentMiddlewareOption
Constants ¶
View Source
const (
// total
STATUS_KEY_TOTAL = "status.total"
// unrequested count
STATUS_KEY_UNREQUESTED = "status.unrequested"
// complete count
STATUS_KEY_COMPLETE = "status.complete"
// speed
STATUS_KEY_SPEED = "status.speed"
)
View Source
const (
ItemKeyChannelToken = "channelToken"
)
Variables ¶
View Source
var (
KeyNotContainError = errors.New("key not in item")
TypeError = errors.New("error type of item value")
)
View Source
var EngineLogger *logrus.Entry = logrus.WithField("scope", "engine")
Functions ¶
func CrawlProcess ¶
func ReadListFile ¶
func RequestWithURL ¶
func RequestWithURL(task *Task, middlewares ...Middleware) (io.Reader, error)
make request with url
Types ¶
type ChannelPipeline ¶
func (*ChannelPipeline) Process ¶
func (p *ChannelPipeline) Process(item interface{}, _ GlobalStore) error
type ChannelPipelineToken ¶
type ChannelPipelineToken interface {
GetToken() string
}
type Context ¶
type Context struct {
Request *http.Request
Response *http.Response
Item interface{}
GlobalStore GlobalStore
Pool TaskPool
Cookie *cookiejar.Jar
Doc *goquery.Document
}
Context shares data across the crawl process
type CookieMiddleware ¶
type CookieMiddleware struct {
Store CookieStore
GetKey func(c *http.Client, r *http.Request, ctx *Context) string
}
func NewCookieMiddleware ¶
func NewCookieMiddleware(option CookieMiddlewareOption) *CookieMiddleware
func (*CookieMiddleware) RequestCallback ¶
type CookieMiddlewareOption ¶
type CookieStore ¶
type DefaultCookieStore ¶
func (*DefaultCookieStore) GetCookie ¶
func (s *DefaultCookieStore) GetCookie(key string) *cookiejar.Jar
func (*DefaultCookieStore) GetOrCreate ¶
func (s *DefaultCookieStore) GetOrCreate(key string) *cookiejar.Jar
type DefaultItem ¶
type DefaultItem struct {
Store map[string]interface{}
}
func (*DefaultItem) GetFloat64 ¶
func (i *DefaultItem) GetFloat64(key string) (float64, error)
func (DefaultItem) GetToken ¶
func (i DefaultItem) GetToken() string
func (*DefaultItem) GetValue ¶
func (i *DefaultItem) GetValue(key string) (interface{}, error)
func (*DefaultItem) SetValue ¶
func (i *DefaultItem) SetValue(key string, value interface{})
type Engine ¶
type Engine struct {
sync.Mutex
*EngineOption
// dispatch task
Pool TaskPool
Parsers []HTMLParser
Middlewares []Middleware
Pipelines []Pipeline
GlobalStore GlobalStore
PostProcess []PostProcess
Plugins []Plugin
// receive signal: force stop pool
InterruptChan chan struct{}
// receive signal: stop pool when all tasks are done
StopPoolChan chan struct{}
}
youcrawl engine
func (*Engine) AddPostProcess ¶
func (e *Engine) AddPostProcess(postprocessList ...PostProcess)
add postprocess
func (*Engine) AddTasks ¶
add tasks to crawl. This is an unsafe operation: the engine must not be running.
While the engine is running, use the RequestPool.AddURLs method instead
func (*Engine) AddURLs ¶
add URLs to crawl. This is an unsafe operation: the engine must not be running.
While the engine is running, use the RequestPool.AddURLs method instead
func (*Engine) UseMiddleware ¶
func (e *Engine) UseMiddleware(middlewares ...Middleware)
add middleware
type EngineOption ¶
type EngineOption struct {
// maximum number of requests running at the same time
MaxRequest int
// true for:
// keep running until manually stopped
Daemon bool
}
init engine config
type GlobalStore ¶
type GlobalStore interface {
Init() error
SetValue(key string, value interface{})
GetValue(key string) interface{}
GetOrCreate(key string, value interface{}) interface{}
}
store engine global
type GlobalStorePipeline ¶
type GlobalStorePipeline struct {
}
GlobalStorePipeline saves the current item to the global items
func (*GlobalStorePipeline) Process ¶
func (g *GlobalStorePipeline) Process(item interface{}, store GlobalStore) error
type HTMLParser ¶
type ImageDownloadItem ¶
type ImageDownloadItem struct {
Urls []string
}
type ImageDownloadPipeline ¶
type ImageDownloadPipeline struct {
// get store folder
//
//./download/image by default
GetStoreFileFolder func(item interface{}, store GlobalStore) string
// get save filename
//
// same name as the image, by default
GetSaveFileName func(item interface{}, store GlobalStore, rawURL string) string
// get urls
//
//if the type of Item is ImageDownloadItem, no need to specify
GetUrls func(item interface{}, store GlobalStore) []string
// maximum number of concurrent downloads
MaxDownload int
// request middlewares to use
Middlewares []Middleware
// called each time an image download completes
OnImageDownloadComplete func(item interface{}, store GlobalStore, url string, downloadFilePath string)
// called once all image downloads have finished, regardless of whether every download succeeded
OnDone func(item interface{}, store GlobalStore)
}
func (*ImageDownloadPipeline) Process ¶
func (i *ImageDownloadPipeline) Process(item interface{}, store GlobalStore) error
type MemoryGlobalStore ¶
func (*MemoryGlobalStore) GetOrCreate ¶
func (s *MemoryGlobalStore) GetOrCreate(key string, value interface{}) interface{}
func (*MemoryGlobalStore) GetValue ¶
func (s *MemoryGlobalStore) GetValue(key string) interface{}
func (*MemoryGlobalStore) Init ¶
func (s *MemoryGlobalStore) Init() error
func (*MemoryGlobalStore) SetValue ¶
func (s *MemoryGlobalStore) SetValue(key string, value interface{})
type Middleware ¶
type OutputCSVPostProcess ¶
type OutputCSVPostProcess struct {
// contains filtered or unexported fields
}
func NewOutputCSVPostProcess ¶
func NewOutputCSVPostProcess(option OutputCSVPostProcessOption) *OutputCSVPostProcess
func (*OutputCSVPostProcess) Process ¶
func (o *OutputCSVPostProcess) Process(store GlobalStore) error
type OutputCSVPostProcessOption ¶
type OutputCSVPostProcessOption struct {
// output path.
// if not provided,use `./output.csv` as default value
OutputPath string
// with header.
// default : false
WithHeader bool
// key to write
// if not provided, all keys will be written
Keys []string
// key to csv column name.
// if not provided, use key name as csv column name
KeysMapping map[string]string
// if value not exist in item.
// by default,use empty string
NotExistValue string
}
type OutputJsonPostProcess ¶
type OutputJsonPostProcess struct {
StorePath string
GetData func(store GlobalStore) interface{}
}
func (*OutputJsonPostProcess) Process ¶
func (p *OutputJsonPostProcess) Process(store GlobalStore) error
type Pipeline ¶
type Pipeline interface {
Process(item interface{}, store GlobalStore) error
}
type PostProcess ¶
type PostProcess interface {
Process(store GlobalStore) error
}
type ProxyMiddleware ¶
type ProxyMiddleware struct {
List []string
}
func NewProxyMiddleware ¶
func NewProxyMiddleware(option ProxyMiddlewareOption) (*ProxyMiddleware, error)
func (*ProxyMiddleware) GetProxy ¶
func (p *ProxyMiddleware) GetProxy() string
func (*ProxyMiddleware) RequestCallback ¶
type ProxyMiddlewareOption ¶
type RequestPool ¶
type RequestPool struct {
Tasks []Task
Total int
CompleteCount int
NextTask *Task
GetTaskChan chan *Task
DoneChan chan struct{}
CompleteChan chan *Task
PreventStop bool
Store GlobalStore
sync.RWMutex
}
func NewRequestPool ¶
func NewRequestPool(option RequestPoolOption, store GlobalStore) *RequestPool
func (*RequestPool) Close ¶
func (p *RequestPool) Close()
func (*RequestPool) GetCompleteCount ¶
func (p *RequestPool) GetCompleteCount() (int, error)
func (*RequestPool) GetDoneChan ¶
func (p *RequestPool) GetDoneChan() chan struct{}
func (*RequestPool) GetOneTask ¶
func (p *RequestPool) GetOneTask(e *Engine) <-chan *Task
func (*RequestPool) GetTotal ¶
func (p *RequestPool) GetTotal() (int, error)
func (*RequestPool) GetUnRequestCount ¶
func (p *RequestPool) GetUnRequestCount() (int, error)
func (*RequestPool) GetUnRequestedTask ¶
func (p *RequestPool) GetUnRequestedTask() (target *Task)
find an unrequested task
func (*RequestPool) OnTaskDone ¶
func (p *RequestPool) OnTaskDone(task *Task)
get task from pool task
func (*RequestPool) SetPrevent ¶
func (p *RequestPool) SetPrevent(isPrevent bool)
type RequestPoolOption ¶
type StatusOutputPlugin ¶
type StatusOutputPlugin struct {
// disable log output
LogOutput bool
}
plugin that logs engine status
func (*StatusOutputPlugin) Run ¶
func (p *StatusOutputPlugin) Run(e *Engine)
type TaskPool ¶
type TaskPool interface {
AddURLs(urls ...string)
AddTasks(task ...*Task)
GetOneTask(e *Engine) <-chan *Task
GetUnRequestedTask() (target *Task)
OnTaskDone(task *Task)
GetDoneChan() chan struct{}
Close()
SetPrevent(isPrevent bool)
GetTotal() (int, error)
GetUnRequestCount() (int, error)
GetCompleteCount() (int, error)
}
type UserAgentMiddleware ¶
type UserAgentMiddleware struct {
List []string
}
func NewUserAgentMiddleware ¶
func NewUserAgentMiddleware(option UserAgentMiddlewareOption) (*UserAgentMiddleware, error)
func (*UserAgentMiddleware) GetUserAgent ¶
func (p *UserAgentMiddleware) GetUserAgent() string
func (*UserAgentMiddleware) RequestCallback ¶
type UserAgentMiddlewareOption ¶
type UserAgentMiddlewareOption struct {
// set user agent list,
// if both UserAgentList and UserAgentFilePath are provided, the two lists are combined
UserAgentList []string
// read useragent from file,use `./ua.txt` by default,
// if both UserAgentList and UserAgentFilePath are provided, the two lists are combined
UserAgentFilePath string
}
Click to show internal directories.
Click to hide internal directories.
