Documentation
¶
Index ¶
- Constants
- Variables
- type Crawler
- type CrawlerQueue
- type Decoder
- type Document
- type DocumentType
- type FTPProto
- type FileProto
- type GzipDecoder
- type HTMLDecoder
- type HTTPProto
- type ImageDecoder
- type MediaDecoder
- type PDFDecoder
- type Proto
- type SFTPProto
- type Settings
- type StdCrawlerQueue
- type TarDecoder
- type ZIPDecoder
Constants ¶
View Source
const ( DocumentFile = DocumentType(iota) DocumentDirectory DocumentLink DocumentSpecial )
Variables ¶
View Source
var ( DefaultSettings = Settings{ Protos: map[string]Proto{ "http": HTTPProto{}, "https": HTTPProto{}, "ftp": NewFTPProto(), "sftp": NewSFTPProto(), }, Decoders: map[string]Decoder{ "text/html": DefaultHTMLDecoder, "application/xhtml+xml": DefaultHTMLDecoder, "application/pdf": DefaultPDFDecoder, "image/png": DefaultImageDecoder, "image/jpeg": DefaultImageDecoder, "image/gif": DefaultImageDecoder, "video/webm": DefaultMediaDecoder, "audio/mpeg": DefaultMediaDecoder, "application/ogg": DefaultMediaDecoder, "application/zip": DefaultZIPDecoder, "application/x-gzip": DefaultGzipDecoder, }, } DefaultCrawler = Crawler{ Settings: DefaultSettings, Queue: make(StdCrawlerQueue), Set: *set.New(), Output: make(chan Document), } )
View Source
var ( DefaultHTMLDecoder = HTMLDecoder{} DefaultPDFDecoder = PDFDecoder{} DefaultImageDecoder = ImageDecoder{} DefaultMediaDecoder = MediaDecoder{} DefaultZIPDecoder = ZIPDecoder{} DefaultGzipDecoder = GzipDecoder{} )
Functions ¶
This section is empty.
Types ¶
type Crawler ¶
type Crawler struct {
Settings
Queue CrawlerQueue
Set set.Set
Output chan Document
ErrC chan error
CloseC chan struct{}
}
func NewCrawler ¶
func NewCrawler() *Crawler
type CrawlerQueue ¶
type CrawlerQueue interface {
// Send sends the given URLs to the queue in the order provided.
Send(urls ...string)
// Recv receives one url from the queue and returns it. It may block.
Recv() (url string)
}
A CrawlerQueue is in most cases equivalent to a channel for sending and receiving strings, but it provides two methods instead, so that external queue systems such as Redis and RabbitMQ can implement it.
type Document ¶
type Document struct {
URL *url.URL
Type DocumentType
ContentType string
Time time.Time
Size int64
Links []string
Title string
Version string
Album string
Artist string
Performer string
Copyright string
License string
Organisation string
Genre string
Date string
ISRC string
Author string
Description string
Content string
NoIndex bool
NoFollow bool
}
type DocumentType ¶
type DocumentType int
type FTPProto ¶
type FTPProto struct {
// contains filtered or unexported fields
}
func NewFTPProto ¶
func NewFTPProto() *FTPProto
type GzipDecoder ¶
type GzipDecoder struct {
Tar TarDecoder
}
func (GzipDecoder) Decode ¶
func (g GzipDecoder) Decode(doc *Document, rc io.ReadCloser) error
type HTMLDecoder ¶
type HTMLDecoder struct{}
func (HTMLDecoder) Decode ¶
func (h HTMLDecoder) Decode(doc *Document, rc io.ReadCloser) error
type ImageDecoder ¶
type ImageDecoder struct{}
func (ImageDecoder) Decode ¶
func (i ImageDecoder) Decode(doc *Document, rc io.ReadCloser) error
type MediaDecoder ¶
type MediaDecoder struct{}
func (MediaDecoder) Decode ¶
func (m MediaDecoder) Decode(doc *Document, rc io.ReadCloser) error
type PDFDecoder ¶
type PDFDecoder struct{}
func (PDFDecoder) Decode ¶
func (p PDFDecoder) Decode(doc *Document, rc io.ReadCloser) error
type SFTPProto ¶
type SFTPProto struct {
// contains filtered or unexported fields
}
func NewSFTPProto ¶
func NewSFTPProto() *SFTPProto
type StdCrawlerQueue ¶
type StdCrawlerQueue chan string
StdCrawlerQueue is a string channel with the methods required by CrawlerQueue.
func (StdCrawlerQueue) Recv ¶
func (s StdCrawlerQueue) Recv() string
Recv receives one string from the channel. It is just a wrapper for <-s, but is needed to fulfill the CrawlerQueue interface.
func (StdCrawlerQueue) Send ¶
func (s StdCrawlerQueue) Send(urls ...string)
Send sends the URLs to the string channel. It is just a wrapper for s <- url, but is needed to fulfill the CrawlerQueue interface.
type TarDecoder ¶
type TarDecoder struct{}
func (TarDecoder) Decode ¶
func (t TarDecoder) Decode(doc *Document, rc io.ReadCloser) error
type ZIPDecoder ¶
type ZIPDecoder struct{}
func (ZIPDecoder) Decode ¶
func (z ZIPDecoder) Decode(doc *Document, rc io.ReadCloser) error
Click to show internal directories.
Click to hide internal directories.