processor

package
v1.0.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 29, 2026 License: MIT Imports: 18 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// DefaultIgnoreList is the default set of entry names skipped during crawling.
// Keys are bare names (not paths) and every listed key maps to true, so a
// plain lookup DefaultIgnoreList[name] reports whether name is ignored.
// NOTE(review): most keys are directory names, but ".DS_Store" is normally a
// file — confirm the walker applies this list to files as well as directories.
var DefaultIgnoreList = map[string]bool{
	".git":         true,
	"node_modules": true,
	"vendor":       true,
	".idea":        true,
	".vscode":      true,
	"__pycache__":  true,
	".DS_Store":    true,
}

DefaultIgnoreList contains directory and file names (for example .git and .DS_Store) to skip during crawling.

View Source
// SupportedExtensions lists the file extensions to process. Every entry is
// lower-case and includes the leading dot.
// NOTE(review): presumably compared against the extension of each discovered
// file — confirm whether the comparison is case-insensitive.
var SupportedExtensions = []string{
	".txt", ".md", ".json", ".jsonl", ".csv", ".tsv", ".yaml", ".yml",
	".pdf", ".docx", ".xlsx", ".html", ".htm", ".sql",
	".odt", ".ods", ".odp", ".rtf", ".xml",
}

SupportedExtensions contains file extensions to process.

Functions

This section is empty.

Types

type CSVExtractor

// CSVExtractor handles CSV and TSV files by converting rows to labeled strings.
type CSVExtractor struct {
	// Separator is the field delimiter.
	// NOTE(review): presumably ',' for CSV and '\t' for TSV — confirm where
	// the extractor is registered.
	Separator rune
}

CSVExtractor handles CSV and TSV by converting rows to labeled strings.

func (*CSVExtractor) Extract

func (e *CSVExtractor) Extract(path string) (string, error)

type Chunk

// Chunk represents a piece of text together with its metadata.
type Chunk struct {
	// Content is the text of this chunk.
	Content string
	// Index identifies the chunk.
	// NOTE(review): presumably its zero-based position in the ChunkText
	// result — confirm against the implementation.
	Index int
}

Chunk represents a piece of text with metadata.

type Chunker

// Chunker splits text into overlapping windows measured in words.
// NOTE(review): presumably Overlap must be smaller than Size for chunking to
// make progress — confirm how ChunkText treats Overlap >= Size or Size <= 0.
type Chunker struct {
	Size    int // Number of words per chunk
	Overlap int // Number of overlapping words
}

Chunker handles splitting text into overlapping windows.

func NewChunker

func NewChunker(size, overlap int) *Chunker

NewChunker creates a new Chunker instance.

func (*Chunker) ChunkText

func (c *Chunker) ChunkText(text string) []Chunk

ChunkText splits the input string into overlapping chunks based on word count.

type ContentExtractor

// ContentExtractor is the interface for extracting text from a file format.
type ContentExtractor interface {
	// Extract takes the path of a file and returns a string and an error.
	// NOTE(review): presumably the string is the file's textual content and
	// is meaningless when the error is non-nil — confirm with implementations.
	Extract(path string) (string, error)
}

ContentExtractor defines the interface for extracting text from various file formats.

type DocxExtractor

// DocxExtractor handles .docx files using nguyenthenguyen/docx. It has no fields.
type DocxExtractor struct{}

DocxExtractor handles .docx files using nguyenthenguyen/docx.

func (*DocxExtractor) Extract

func (e *DocxExtractor) Extract(path string) (string, error)

type ExtractorRegistry

// ExtractorRegistry maps file extensions to their respective extractors.
// Its fields are unexported; extractors are added with Register.
type ExtractorRegistry struct {
	// contains filtered or unexported fields
}

ExtractorRegistry maps file extensions to their respective extractors.

func NewExtractorRegistry

func NewExtractorRegistry() *ExtractorRegistry

NewExtractorRegistry initializes the registry with all supported extractors.

func (*ExtractorRegistry) Get

func (*ExtractorRegistry) Register

func (r *ExtractorRegistry) Register(ext string, e ContentExtractor)

type FileResult

// FileResult represents a discovered file and its content.
type FileResult struct {
	// Path is the path of the discovered file.
	// NOTE(review): unclear whether it is absolute or relative to the walk
	// root — confirm against Walker.Walk.
	Path string
	// Content is the content associated with the file.
	// NOTE(review): presumably the text returned by the matching
	// ContentExtractor — confirm.
	Content string
}

FileResult represents a discovered file and its content.

type HTMLExtractor

// HTMLExtractor handles HTML files using goquery. It has no fields.
type HTMLExtractor struct{}

HTMLExtractor handles HTML files using goquery.

func (*HTMLExtractor) Extract

func (e *HTMLExtractor) Extract(path string) (string, error)

type JSONExtractor

// JSONExtractor handles JSON and JSONL files by extracting string values or
// pretty-printing. It has no fields.
type JSONExtractor struct{}

JSONExtractor handles JSON and JSONL by extracting string values or pretty-printing.

func (*JSONExtractor) Extract

func (e *JSONExtractor) Extract(path string) (string, error)

type ODFExtractor

// ODFExtractor handles OpenDocument files (.odt, .ods, .odp). It has no fields.
type ODFExtractor struct{}

ODFExtractor handles OpenDocument files (.odt, .ods, .odp).

func (*ODFExtractor) Extract

func (e *ODFExtractor) Extract(path string) (string, error)

type PDFExtractor

// PDFExtractor handles PDF files using ledongthuc/pdf. It has no fields.
type PDFExtractor struct{}

PDFExtractor handles PDF files using ledongthuc/pdf.

func (*PDFExtractor) Extract

func (e *PDFExtractor) Extract(path string) (string, error)

type RTFExtractor

// RTFExtractor handles Rich Text Format files using J45k4/rtf. It has no fields.
type RTFExtractor struct{}

RTFExtractor handles Rich Text Format using J45k4/rtf.

func (*RTFExtractor) Extract

func (e *RTFExtractor) Extract(path string) (string, error)

type SQLExtractor

// SQLExtractor handles SQL dumps by extracting comments and INSERT values.
// It has no fields.
type SQLExtractor struct{}

SQLExtractor handles SQL dumps by extracting comments and INSERT values.

func (*SQLExtractor) Extract

func (e *SQLExtractor) Extract(path string) (string, error)

type TextExtractor

// TextExtractor handles plain text and Markdown files. It has no fields.
type TextExtractor struct{}

TextExtractor handles plain text and Markdown.

func (*TextExtractor) Extract

func (e *TextExtractor) Extract(path string) (string, error)

type Walker

// Walker recursively walks a directory and sends file contents to a channel.
type Walker struct {
	// IgnoreList holds the entry names to skip.
	// NOTE(review): NewWalker is documented as using the default ignore list;
	// confirm whether it copies DefaultIgnoreList or shares the same map.
	IgnoreList map[string]bool
	// Registry is the extension-to-extractor registry.
	// NOTE(review): presumably used to pick the extractor for each file by
	// its extension — confirm.
	Registry *ExtractorRegistry
}

Walker recursively walks a directory and sends file contents to a channel.

func NewWalker

func NewWalker() *Walker

NewWalker creates a Walker with default ignore list and extractors.

func (*Walker) Walk

func (w *Walker) Walk(root string) (<-chan FileResult, <-chan error)

Walk recursively walks root and sends FileResults to the returned FileResult channel; the second returned channel carries errors. The FileResult channel is closed when all files have been sent.

type XMLExtractor

// XMLExtractor handles generic XML files. It has no fields.
type XMLExtractor struct{}

XMLExtractor handles generic XML files.

func (*XMLExtractor) Extract

func (e *XMLExtractor) Extract(path string) (string, error)

type XlsxExtractor

// XlsxExtractor handles .xlsx files using excelize. It has no fields.
type XlsxExtractor struct{}

XlsxExtractor handles .xlsx files using excelize.

func (*XlsxExtractor) Extract

func (e *XlsxExtractor) Extract(path string) (string, error)

type YAMLExtractor

// YAMLExtractor handles YAML files. It has no fields.
type YAMLExtractor struct{}

YAMLExtractor handles YAML files.

func (*YAMLExtractor) Extract

func (e *YAMLExtractor) Extract(path string) (string, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL