Adding scrape_options
This commit is contained in:
parent
696669aae4
commit
548307102d
20
.env.sample
20
.env.sample
@@ -18,4 +18,22 @@ CRAWLER_COMPANY_FILE=data/verint-responsible-ethical-ai.pdf
|
|||||||
FIRECRAWL_API_KEY=your-firecrawl-api-key
|
FIRECRAWL_API_KEY=your-firecrawl-api-key
|
||||||
FIRECRAWL_API_URL=http://localhost:3002
|
FIRECRAWL_API_URL=http://localhost:3002
|
||||||
FIRECRAWL_MODE=crawl
|
FIRECRAWL_MODE=crawl
|
||||||
FIRECRAWL_PARAMS='{ "limit": 100, "include_paths": ["/.*"], "ignore_sitemap": true, "poll_interval": 5 }'
|
# FIRECRAWL_PARAMS: pass the crawl parameters as a single JSON string. Supported keys:
|
||||||
|
#
|
||||||
|
# include_paths (Optional[List[str]]): Patterns of URLs to include
|
||||||
|
# exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
||||||
|
# max_depth (Optional[int]): Maximum crawl depth
|
||||||
|
# max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
||||||
|
# limit (Optional[int]): Maximum pages to crawl
|
||||||
|
# allow_backward_links (Optional[bool]): Follow parent directory links
|
||||||
|
# allow_external_links (Optional[bool]): Follow external domain links
|
||||||
|
# ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
||||||
|
# scrape_options (Optional[ScrapeOptions]): Page scraping configuration (see https://docs.firecrawl.dev/advanced-scraping-guide#scrape-options)
|
||||||
|
# webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
||||||
|
# deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
||||||
|
# ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
||||||
|
# regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
||||||
|
# delay (Optional[int]): Delay in seconds between scrapes
|
||||||
|
# poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
||||||
|
# idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
||||||
|
FIRECRAWL_PARAMS='{ "limit": 100, "include_paths": ["/.*"], "ignore_sitemap": true, "poll_interval": 5, "scrape_options": { "headers": { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.4 Safari/605.1.15"}}}'
|
||||||
Loading…
x
Reference in New Issue
Block a user