Documentation
¶
Index ¶
- Constants
- Variables
- func ExtractAndClean(htmlContent string) (string, error)
- func ExtractText(htmlContent string) (string, error)
- func ExtractTitle(htmlContent string) (string, error)
- func ExtractToJSON(htmlContent string) ([]byte, error)
- func ExtractToMarkdown(htmlContent string) (string, error)
- func ExtractWithTitle(htmlContent string) (string, string, error)
- func GetReadingTime(htmlContent string) (float64, error)
- func GetWordCount(htmlContent string) (int, error)
- func GroupLinksByType(links []LinkResource) map[string][]LinkResource
- func Summarize(htmlContent string, maxWords int) (string, error)
- type Attribute
- type AudioInfo
- type Config
- type ExtractConfig
- type ImageInfo
- type LinkExtractionConfig
- type LinkInfo
- type LinkResource
- type Node
- type NodeType
- type Processor
- func (p *Processor) ClearCache()
- func (p *Processor) Close() error
- func (p *Processor) Extract(htmlContent string, configs ...ExtractConfig) (*Result, error)
- func (p *Processor) ExtractAllLinks(htmlContent string, configs ...LinkExtractionConfig) ([]LinkResource, error)
- func (p *Processor) ExtractBatch(htmlContents []string, configs ...ExtractConfig) ([]*Result, error)
- func (p *Processor) ExtractBatchFiles(filePaths []string, configs ...ExtractConfig) ([]*Result, error)
- func (p *Processor) ExtractFromFile(filePath string, configs ...ExtractConfig) (*Result, error)
- func (p *Processor) ExtractWithDefaults(htmlContent string) (*Result, error)
- func (p *Processor) GetStatistics() Statistics
- type Result
- type Statistics
- type Token
- type TokenType
- type Tokenizer
- type VideoInfo
Constants ¶
View Source
const (
	ErrorNode           = html.ErrorNode
	TextNode            = html.TextNode
	DocumentNode        = html.DocumentNode
	ElementNode         = html.ElementNode
	CommentNode         = html.CommentNode
	DoctypeNode         = html.DoctypeNode
	ErrorToken          = html.ErrorToken
	TextToken           = html.TextToken
	StartTagToken       = html.StartTagToken
	EndTagToken         = html.EndTagToken
	SelfClosingTagToken = html.SelfClosingTagToken
	CommentToken        = html.CommentToken
	DoctypeToken        = html.DoctypeToken
)
View Source
const (
	DefaultMaxInputSize      = 50 * 1024 * 1024
	DefaultMaxCacheEntries   = 1000
	DefaultWorkerPoolSize    = 4
	DefaultCacheTTL          = time.Hour
	DefaultMaxDepth          = 100
	DefaultProcessingTimeout = 30 * time.Second
)
Variables ¶
View Source
var (
	// ErrInputTooLarge is returned when input exceeds MaxInputSize.
	ErrInputTooLarge = errors.New("html: input size exceeds maximum")
	// ErrInvalidHTML is returned when HTML parsing fails.
	ErrInvalidHTML = errors.New("html: invalid HTML")
	// ErrProcessorClosed is returned when operations are attempted on a closed processor.
	ErrProcessorClosed = errors.New("html: processor closed")
	// ErrMaxDepthExceeded is returned when HTML nesting exceeds MaxDepth.
	ErrMaxDepthExceeded = errors.New("html: max depth exceeded")
	// ErrInvalidConfig is returned when configuration validation fails.
	ErrInvalidConfig = errors.New("html: invalid config")
	// ErrProcessingTimeout is returned when processing exceeds ProcessingTimeout.
	ErrProcessingTimeout = errors.New("html: processing timeout exceeded")
	// ErrFileNotFound is returned when specified file cannot be read.
	ErrFileNotFound = errors.New("html: file not found")
)
Error definitions for the `cybergodev/html` package.
View Source
var (
	Parse          = html.Parse
	ParseFragment  = html.ParseFragment
	Render         = html.Render
	EscapeString   = html.EscapeString
	UnescapeString = html.UnescapeString
	NewTokenizer   = html.NewTokenizer
)
Functions ¶
func ExtractAndClean ¶ added in v1.0.4
func ExtractText ¶ added in v1.0.2
ExtractText extracts only text content without metadata.
func ExtractTitle ¶ added in v1.0.4
func ExtractToJSON ¶ added in v1.0.4
func ExtractToMarkdown ¶ added in v1.0.4
func ExtractWithTitle ¶ added in v1.0.4
func GetReadingTime ¶ added in v1.0.4
func GetWordCount ¶ added in v1.0.4
func GroupLinksByType ¶ added in v1.0.2
func GroupLinksByType(links []LinkResource) map[string][]LinkResource
GroupLinksByType groups LinkResource slice by their Type field.
Types ¶
type AudioInfo ¶
func ExtractAudios ¶ added in v1.0.4
type Config ¶
type Config struct {
MaxInputSize int
MaxCacheEntries int
CacheTTL time.Duration
WorkerPoolSize int
EnableSanitization bool
MaxDepth int
ProcessingTimeout time.Duration
}
func DefaultConfig ¶
func DefaultConfig() Config
type ExtractConfig ¶
type ExtractConfig struct {
ExtractArticle bool
PreserveImages bool
PreserveLinks bool
PreserveVideos bool
PreserveAudios bool
InlineImageFormat string
}
func ConfigForMarkdown ¶ added in v1.0.4
func ConfigForMarkdown() ExtractConfig
func ConfigForRSS ¶ added in v1.0.4
func ConfigForRSS() ExtractConfig
func ConfigForSearchIndex ¶ added in v1.0.4
func ConfigForSearchIndex() ExtractConfig
func ConfigForSummary ¶ added in v1.0.4
func ConfigForSummary() ExtractConfig
func DefaultExtractConfig ¶
func DefaultExtractConfig() ExtractConfig
type ImageInfo ¶
type ImageInfo struct {
URL string
Alt string
Title string
Width string
Height string
IsDecorative bool
Position int
}
func ExtractImages ¶ added in v1.0.4
type LinkExtractionConfig ¶ added in v1.0.2
type LinkExtractionConfig struct {
ResolveRelativeURLs bool
BaseURL string
IncludeImages bool
IncludeVideos bool
IncludeAudios bool
IncludeCSS bool
IncludeJS bool
IncludeContentLinks bool
IncludeExternalLinks bool
IncludeIcons bool
}
func DefaultLinkExtractionConfig ¶ added in v1.0.2
func DefaultLinkExtractionConfig() LinkExtractionConfig
type LinkInfo ¶
func ExtractLinks ¶ added in v1.0.4
type LinkResource ¶ added in v1.0.2
func ExtractAllLinks ¶ added in v1.0.2
func ExtractAllLinks(htmlContent string, configs ...LinkExtractionConfig) ([]LinkResource, error)
type Processor ¶
type Processor struct {
// contains filtered or unexported fields
}
func NewWithDefaults ¶
func NewWithDefaults() *Processor
func (*Processor) ClearCache ¶
func (p *Processor) ClearCache()
func (*Processor) Extract ¶
func (p *Processor) Extract(htmlContent string, configs ...ExtractConfig) (*Result, error)
func (*Processor) ExtractAllLinks ¶ added in v1.0.2
func (p *Processor) ExtractAllLinks(htmlContent string, configs ...LinkExtractionConfig) ([]LinkResource, error)
func (*Processor) ExtractBatch ¶
func (p *Processor) ExtractBatch(htmlContents []string, configs ...ExtractConfig) ([]*Result, error)
func (*Processor) ExtractBatchFiles ¶
func (p *Processor) ExtractBatchFiles(filePaths []string, configs ...ExtractConfig) ([]*Result, error)
func (*Processor) ExtractFromFile ¶
func (p *Processor) ExtractFromFile(filePath string, configs ...ExtractConfig) (*Result, error)
func (*Processor) ExtractWithDefaults ¶
func (*Processor) GetStatistics ¶
func (p *Processor) GetStatistics() Statistics
type Result ¶
type Result struct {
Text string
Title string
Images []ImageInfo
Links []LinkInfo
Videos []VideoInfo
Audios []AudioInfo
ProcessingTime time.Duration
WordCount int
ReadingTime time.Duration
}
func Extract ¶ added in v1.0.2
func Extract(htmlContent string, configs ...ExtractConfig) (*Result, error)
func ExtractFromFile ¶ added in v1.0.2
func ExtractFromFile(filePath string, configs ...ExtractConfig) (*Result, error)
type Statistics ¶
Click to show internal directories.
Click to hide internal directories.