Documentation
¶
Index ¶
- Constants
- Variables
- func AddCookieToJar(urlAddr string, cookies ...*http.Cookie) func(s *Spider)
- func GetRequestHash(r *Request) [md5.Size]byte
- func Limiter(WhiteList bool, rules ...*LimitRule) func(s *Spider)
- func RandomProxy(p ...string) func(s *Spider)
- func RandomUserAgent() func(s *Spider)
- func RedisDistributed(ro *redis.Options, sName string, useDeduplicate bool, ...) func(s *Spider)
- func RedisReqDeduplicate(r *redis.Client, sName string) func(s *Spider)
- func RefererFiller() func(s *Spider)
- func ReqDeduplicate() func(s *Spider)
- func Retry(maxTimes int, okcode ...int) func(s *Spider)
- func RobotsTxt(baseUrl, ua string) func(s *Spider)
- func SaveItemsAsCSV(f *os.File) func(s *Spider)
- func SaveItemsAsJSON(f *os.File) func(s *Spider)
- func SetDepthFirst(d bool) func(s *Spider)
- func SpiderLogError(f *os.File) func(s *Spider)
- func SpiderLogPrint() func(s *Spider)
- type BaseDownloader
- type BaseScheduler
- type Context
- type CsvItem
- type CtxHandlerFun
- type Downloader
- type DownloaderErr
- type ErrorItem
- type JsonItem
- type LimitRule
- type LimitRuleAllow
- type Manager
- type RedisScheduler
- type Request
- func (s *Request) AddCookie(c *http.Cookie) *Request
- func (s *Request) AddParam(k, v string) *Request
- func (s *Request) GetBody() []byte
- func (s *Request) SetHeader(key, value string) *Request
- func (s *Request) SetParam(p map[string]string) *Request
- func (s *Request) SetProxy(p string) *Request
- func (s *Request) SetUA(ua string) *Request
- func (s *Request) WithMeta(k string, v interface{}) *Request
- type Response
- type Scheduler
- type Spider
- func (s *Spider) AddTask(request *Request, handlers ...CtxHandlerFun)
- func (s *Spider) OnAdd(fn func(ctx *Context, t *Task) *Task)
- func (s *Spider) OnError(fn func(ctx *Context, err error))
- func (s *Spider) OnFinish(fn func(s *Spider))
- func (s *Spider) OnHTML(selector string, fn func(ctx *Context, sel *goquery.Selection))
- func (s *Spider) OnItem(fn func(i interface{}) interface{})
- func (s *Spider) OnJSON(q string, fn func(ctx *Context, j gjson.Result))
- func (s *Spider) OnReq(fn func(ctx *Context, req *Request) *Request)
- func (s *Spider) OnResp(fn CtxHandlerFun)
- func (s *Spider) OnStart(fn func(s *Spider))
- func (s *Spider) Run()
- func (s *Spider) SetItemPoolSize(i int)
- func (s *Spider) SetTaskPoolSize(i int)
- func (s *Spider) Use(fn ...func(s *Spider))
- type Task
Constants ¶
const DeduplicateSuffix = "_deduplicate"
const ItemsSuffix = "_items"
const TasksSuffix = "_tasks"
Variables ¶
var D = NewBaseDownloader()
var Do = D.Do
var ErrRunFinishedSpider = errors.New("running a spider which is finished,you could recreate this spider and run the new one")
var GetReq = Get
Deprecated: will be removed in the next major version.
var Log = logging.MustGetLogger("goribot")
var PostReq = Post
Deprecated: will be removed in the next major version.
Functions ¶
func AddCookieToJar ¶ added in v0.1.5
AddCookieToJar is an extension that adds cookies to the downloader's cookie jar
func GetRequestHash ¶
GetRequestHash returns a hash of the url, header, cookie and body data of a request
func RandomProxy ¶ added in v0.1.1
RandomProxy is an extension that sets a random proxy url for each new task
func RandomUserAgent ¶
func RandomUserAgent() func(s *Spider)
RandomUserAgent is an extension that sets a random User-Agent for each new task
func RedisDistributed ¶ added in v0.1.0
func RedisReqDeduplicate ¶ added in v0.1.0
RedisReqDeduplicate is an extension that deduplicates new tasks using redis, to support distributed spiders
func RefererFiller ¶
func RefererFiller() func(s *Spider)
RefererFiller is an extension that adds a Referer to each new task
func ReqDeduplicate ¶
func ReqDeduplicate() func(s *Spider)
ReqDeduplicate is an extension that deduplicates new tasks
func SaveItemsAsCSV ¶ added in v0.1.1
SaveItemsAsCSV is an extension that saves items to a csv file
func SaveItemsAsJSON ¶ added in v0.1.1
SaveItemsAsJSON is an extension that saves items to a json file
func SetDepthFirst ¶ added in v0.1.0
SetDepthFirst is an extension that changes the Scheduler's DepthFirst setting
func SpiderLogError ¶ added in v0.1.1
SpiderLogError is an extension that logs special or error responses
func SpiderLogPrint ¶ added in v0.1.1
func SpiderLogPrint() func(s *Spider)
SpiderLogPrint is an extension that prints the spider's working status
Types ¶
type BaseDownloader ¶ added in v0.1.0
BaseDownloader is the default downloader of goribot
func NewBaseDownloader ¶ added in v0.1.0
func NewBaseDownloader() *BaseDownloader
func (*BaseDownloader) AddMiddleware ¶ added in v0.1.2
type BaseScheduler ¶ added in v0.1.0
type BaseScheduler struct {
// DepthFirst, when set, pushes new tasks to the top of the queue
DepthFirst bool
// contains filtered or unexported fields
}
BaseScheduler is the default scheduler of goribot
func NewBaseScheduler ¶ added in v0.1.0
func NewBaseScheduler(depthFirst bool) *BaseScheduler
func (*BaseScheduler) AddItem ¶ added in v0.1.0
func (s *BaseScheduler) AddItem(i interface{})
func (*BaseScheduler) AddTask ¶ added in v0.1.0
func (s *BaseScheduler) AddTask(t *Task)
func (*BaseScheduler) GetItem ¶ added in v0.1.0
func (s *BaseScheduler) GetItem() interface{}
func (*BaseScheduler) GetTask ¶ added in v0.1.0
func (s *BaseScheduler) GetTask() *Task
func (*BaseScheduler) IsItemEmpty ¶ added in v0.1.0
func (s *BaseScheduler) IsItemEmpty() bool
func (*BaseScheduler) IsTaskEmpty ¶ added in v0.1.0
func (s *BaseScheduler) IsTaskEmpty() bool
type Context ¶
type Context struct {
// Req is the origin request
Req *Request
// Resp is the response object
Resp *Response
// Meta holds the k-v pairs carried by a request task created by the NewTaskWithMeta func
Meta map[string]interface{}
Handlers []CtxHandlerFun
// contains filtered or unexported fields
}
Context is a wrapper of the response, the original request, new tasks, etc.
func (*Context) Abort ¶ added in v0.1.0
func (c *Context) Abort()
Abort this context to break the handler chain and stop handling
func (*Context) AddItem ¶
func (c *Context) AddItem(i interface{})
AddItem adds an item to the new item list. After every handler func returns, the spider will collect these items and call the OnItem handler func
func (*Context) AddTask ¶
func (c *Context) AddTask(request *Request, handlers ...CtxHandlerFun)
AddTask adds a task to the new task list. After every handler func returns, the spider will collect these tasks
type CtxHandlerFun ¶ added in v0.1.0
type CtxHandlerFun func(ctx *Context)
type Downloader ¶ added in v0.1.0
type Downloader interface {
Do(req *Request) (resp *Response, err error)
AddMiddleware(func(req *Request, next func(req *Request) (resp *Response, err error)) (resp *Response, err error))
}
Downloader is a tool that downloads a response for a request
type DownloaderErr ¶ added in v0.1.0
type DownloaderErr struct {
// Request is the Request object when the error occurred
Request *Request
// Response is the Response object when the error occurred. It could be nil.
Response *Response
// contains filtered or unexported fields
}
DownloaderErr is an error created by the Downloader
type LimitRule ¶ added in v0.1.1
type LimitRuleAllow ¶ added in v0.1.1
type LimitRuleAllow uint8
const ( NotSet LimitRuleAllow = iota Allow Disallow )
type Manager ¶ added in v0.1.0
type Manager struct {
// contains filtered or unexported fields
}
func (*Manager) OnItem ¶ added in v0.1.0
func (s *Manager) OnItem(fn func(i interface{}) interface{})
func (*Manager) SetItemPoolSize ¶ added in v0.1.0
type RedisScheduler ¶ added in v0.1.0
type RedisScheduler struct {
// contains filtered or unexported fields
}
RedisScheduler is the redis-backed scheduler of goribot
func NewRedisScheduler ¶ added in v0.1.0
func NewRedisScheduler(redis *redis.Client, sName string, bs int, fn ...CtxHandlerFun) *RedisScheduler
func (*RedisScheduler) AddItem ¶ added in v0.1.0
func (s *RedisScheduler) AddItem(i interface{})
func (*RedisScheduler) AddTask ¶ added in v0.1.0
func (s *RedisScheduler) AddTask(t *Task)
func (*RedisScheduler) GetItem ¶ added in v0.1.0
func (s *RedisScheduler) GetItem() interface{}
func (*RedisScheduler) GetTask ¶ added in v0.1.0
func (s *RedisScheduler) GetTask() *Task
func (*RedisScheduler) IsItemEmpty ¶ added in v0.1.0
func (s *RedisScheduler) IsItemEmpty() bool
func (*RedisScheduler) IsTaskEmpty ¶ added in v0.1.0
func (s *RedisScheduler) IsTaskEmpty() bool
type Request ¶
type Request struct {
*http.Request
Depth int
// ResponseCharacterEncoding is the character encoding of the response body.
// Leave it blank to allow automatic character encoding of the response body.
// It is empty by default and it can be set in an OnReq callback.
ResponseCharacterEncoding string
// ProxyURL is the proxy address that handles the request
ProxyURL string
// Meta contains data between a Request and a Response
Meta map[string]interface{}
Err error
// contains filtered or unexported fields
}
Request is an object representing an HTTP request
func PostFormReq ¶ added in v0.1.0
PostFormReq creates a post request with form data
func PostJsonReq ¶ added in v0.1.0
PostJsonReq creates a post request with json data
func PostRawReq ¶ added in v0.1.0
PostRawReq creates a post request with raw data
func (*Request) SetHeader ¶
SetHeader sets the header entries associated with key to the single element value.
func (*Request) SetParam ¶ added in v0.1.0
SetParam sets the query params of the request url. Deprecated: will be removed in the next major version.
type Response ¶
type Response struct {
*http.Response
// Body is the content of the Response
Body []byte
// Text is the content of the Response parsed as string
Text string
// Req is the goribot Request object of the response. Tip: there is another Request attribute that comes from *http.Response
Req *Request
// Dom is the parsed html object
Dom *goquery.Document
// Meta contains data between a Request and a Response
Meta map[string]interface{}
}
Response is an object representing an HTTP response
func (*Response) DecodeAndParse ¶ added in v0.1.0
DecodeAndParse decodes the body to text and tries to parse it as html or json.
type Scheduler ¶ added in v0.1.0
type Scheduler interface {
// GetTask pops a task
GetTask() *Task
// GetItem pops an item
GetItem() interface{}
// AddTask pushes a task
AddTask(t *Task)
// AddItem pushes an item
AddItem(i interface{})
// IsTaskEmpty reports whether the task queue is empty
IsTaskEmpty() bool
// IsItemEmpty reports whether the item queue is empty
IsItemEmpty() bool
}
Scheduler is a queue of tasks and items
type Spider ¶
type Spider struct {
Scheduler Scheduler
Downloader Downloader
AutoStop bool
// contains filtered or unexported fields
}
func (*Spider) AddTask ¶
func (s *Spider) AddTask(request *Request, handlers ...CtxHandlerFun)
func (*Spider) OnAdd ¶ added in v0.1.0
***********************************************************************************
func (*Spider) OnError ¶
***********************************************************************************
func (*Spider) OnFinish ¶ added in v0.1.0
***********************************************************************************
func (*Spider) OnItem ¶
func (s *Spider) OnItem(fn func(i interface{}) interface{})
***********************************************************************************
func (*Spider) OnReq ¶ added in v0.1.0
***********************************************************************************
func (*Spider) OnResp ¶
func (s *Spider) OnResp(fn CtxHandlerFun)
***********************************************************************************
func (*Spider) OnStart ¶ added in v0.1.0
***********************************************************************************
func (*Spider) SetItemPoolSize ¶ added in v0.1.0
func (*Spider) SetTaskPoolSize ¶ added in v0.1.0
type Task ¶
type Task struct {
Request *Request
Handlers []CtxHandlerFun
}
func NewTask ¶
func NewTask(request *Request, handlers ...CtxHandlerFun) *Task