Skip to content

Commit 6551c5a

Browse files
committed
chore: wip
1 parent 3fa4b39 commit 6551c5a

30 files changed

+392
-383
lines changed

README.md

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ const scraper = createScraper({
4949

5050
// Scrape a website
5151
const result = await scraper.scrape('https://example.com', {
52-
extract: (doc) => ({
52+
extract: doc => ({
5353
title: doc.querySelector('title')?.textContent,
5454
headings: Array.from(doc.querySelectorAll('h1')).map(h => h.textContent),
5555
}),
@@ -103,19 +103,19 @@ const scraper = createScraper({
103103
Extract and transform data using pipelines:
104104

105105
```typescript
106-
import { pipeline, extractors } from 'ts-web-scraper'
106+
import { extractors, pipeline } from 'ts-web-scraper'
107107

108108
const extractProducts = pipeline()
109109
.step(extractors.structured('.product', {
110110
name: '.product-name',
111111
price: '.product-price',
112112
rating: '.rating',
113113
}))
114-
.map('parse-price', (p) => ({
114+
.map('parse-price', p => ({
115115
...p,
116-
price: parseFloat(p.price.replace(/[^0-9.]/g, '')),
116+
price: Number.parseFloat(p.price.replace(/[^0-9.]/g, '')),
117117
}))
118-
.filter('in-stock', (products) => products.every(p => p.price > 0))
118+
.filter('in-stock', products => products.every(p => p.price > 0))
119119
.sort('by-price', (a, b) => a.price - b.price)
120120

121121
const result = await extractProducts.execute(document)
@@ -130,13 +130,13 @@ const scraper = createScraper({ trackChanges: true })
130130

131131
// First scrape
132132
const result1 = await scraper.scrape('https://example.com', {
133-
extract: (doc) => ({ price: doc.querySelector('.price')?.textContent }),
133+
extract: doc => ({ price: doc.querySelector('.price')?.textContent }),
134134
})
135135
// result1.changed === undefined (no previous snapshot)
136136

137137
// Second scrape
138138
const result2 = await scraper.scrape('https://example.com', {
139-
extract: (doc) => ({ price: doc.querySelector('.price')?.textContent }),
139+
extract: doc => ({ price: doc.querySelector('.price')?.textContent }),
140140
})
141141
// result2.changed === false (if price hasn't changed)
142142
```
@@ -168,7 +168,7 @@ Automatically traverse paginated content:
168168

169169
```typescript
170170
for await (const page of scraper.scrapeAll('https://example.com/posts', {
171-
extract: (doc) => ({
171+
extract: doc => ({
172172
posts: extractors.structured('article', {
173173
title: 'h2',
174174
content: '.content',
@@ -204,9 +204,9 @@ Validate extracted data against schemas:
204204

205205
```typescript
206206
const result = await scraper.scrape('https://example.com', {
207-
extract: (doc) => ({
207+
extract: doc => ({
208208
title: doc.querySelector('title')?.textContent,
209-
price: parseFloat(doc.querySelector('.price')?.textContent || '0'),
209+
price: Number.parseFloat(doc.querySelector('.price')?.textContent || '0'),
210210
}),
211211
validate: {
212212
title: { type: 'string', required: true },
@@ -217,7 +217,8 @@ const result = await scraper.scrape('https://example.com', {
217217
if (result.success) {
218218
// Data is valid and typed
219219
console.log(result.data.title, result.data.price)
220-
} else {
220+
}
221+
else {
221222
console.error(result.error)
222223
}
223224
```

docs/.vitepress/components.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// @ts-nocheck
33
// Generated by unplugin-vue-components
44
// Read more: https://github.com/vuejs/core/pull/3399
5+
// biome-ignore lint: disable
56
export {}
67

78
/* prettier-ignore */

docs/advanced/client-side.md

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ if (isCSR) {
3939
console.log('Site uses client-side rendering')
4040
// Use client-side scraper
4141
const data = await scrapeClientSide('https://example.com')
42-
} else {
42+
}
43+
else {
4344
console.log('Site uses server-side rendering')
4445
// Use regular scraper
4546
const scraper = createScraper()
@@ -79,11 +80,11 @@ Analyze JavaScript to find API endpoints:
7980
```typescript
8081
const result = await scrapeClientSide('https://example.com', {
8182
analyzeJavaScript: true,
82-
maxJSFiles: 10, // Analyze up to 10 JS files
83+
maxJSFiles: 10, // Analyze up to 10 JS files
8384
})
8485

8586
console.log('Found API endpoints:')
86-
result.apiEndpoints.forEach(endpoint => {
87+
result.apiEndpoints.forEach((endpoint) => {
8788
console.log(' -', endpoint)
8889
})
8990

@@ -100,13 +101,13 @@ Customize client-side scraping:
100101

101102
```typescript
102103
const result = await scrapeClientSide('https://example.com', {
103-
timeout: 30000, // 30 second timeout
104-
userAgent: 'MyBot/1.0', // Custom user agent
105-
analyzeJavaScript: true, // Analyze JS bundles
106-
findEmbeddedData: true, // Extract embedded data
107-
reconstructAPI: true, // Fetch discovered APIs
108-
maxJSFiles: 5, // Max JS files to analyze
109-
headers: { // Custom headers
104+
timeout: 30000, // 30 second timeout
105+
userAgent: 'MyBot/1.0', // Custom user agent
106+
analyzeJavaScript: true, // Analyze JS bundles
107+
findEmbeddedData: true, // Extract embedded data
108+
reconstructAPI: true, // Fetch discovered APIs
109+
maxJSFiles: 5, // Max JS files to analyze
110+
headers: { // Custom headers
110111
'Accept-Language': 'en-US',
111112
},
112113
})
@@ -184,8 +185,8 @@ Scrape authenticated pages:
184185
```typescript
185186
const result = await scrapeClientSide('https://app.example.com/dashboard', {
186187
headers: {
187-
'Cookie': 'session=abc123; auth_token=xyz789',
188-
'Authorization': 'Bearer your-token-here',
188+
Cookie: 'session=abc123; auth_token=xyz789',
189+
Authorization: 'Bearer your-token-here',
189190
},
190191
})
191192

@@ -212,7 +213,7 @@ console.log('All meta:', result.meta)
212213
Use both approaches together:
213214

214215
```typescript
215-
import { createScraper, scrapeClientSide, isClientSideRendered } from 'ts-web-scraper'
216+
import { createScraper, isClientSideRendered, scrapeClientSide } from 'ts-web-scraper'
216217

217218
async function smartScrape(url: string) {
218219
// Detect rendering method
@@ -226,11 +227,12 @@ async function smartScrape(url: string) {
226227
apis: result.apiResponses,
227228
meta: result.meta,
228229
}
229-
} else {
230+
}
231+
else {
230232
// Server-side rendered - use regular scraper
231233
const scraper = createScraper()
232234
const result = await scraper.scrape(url, {
233-
extract: (doc) => ({
235+
extract: doc => ({
234236
title: doc.querySelector('h1')?.textContent,
235237
content: doc.querySelector('.content')?.textContent,
236238
}),
@@ -310,8 +312,8 @@ try {
310312
if (Object.keys(result.embeddedData).length === 0) {
311313
console.warn('No embedded data found')
312314
}
313-
314-
} catch (error) {
315+
}
316+
catch (error) {
315317
console.error('Client-side scraping failed:', error)
316318
// Fall back to regular scraping
317319
}
@@ -356,12 +358,12 @@ const result = await scrapeClientSide('https://example.com', {
356358
maxJSFiles: 5,
357359
analyzeJavaScript: true,
358360
findEmbeddedData: true,
359-
reconstructAPI: false, // Skip if you only need structure
361+
reconstructAPI: false, // Skip if you only need structure
360362
})
361363

362364
// Priority order: embedded -> API -> meta
363-
const data =
364-
Object.keys(result.embeddedData).length > 0
365+
const data
366+
= Object.keys(result.embeddedData).length > 0
365367
? result.embeddedData
366368
: result.apiResponses.size > 0
367369
? Object.fromEntries(result.apiResponses)

docs/advanced/graphql.md

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ Detect and interact with GraphQL APIs for efficient data extraction.
77
Automatically detect GraphQL endpoints:
88

99
```typescript
10-
import { detectGraphQL } from 'ts-web-scraper'
11-
import { createScraper } from 'ts-web-scraper'
10+
import { createScraper, detectGraphQL } from 'ts-web-scraper'
1211

1312
const scraper = createScraper()
1413

@@ -21,7 +20,7 @@ const detection = detectGraphQL(result.html, result.url)
2120
console.log('Has GraphQL:', detection.hasGraphQL)
2221
console.log('Endpoints found:', detection.endpoints.length)
2322

24-
detection.endpoints.forEach(endpoint => {
23+
detection.endpoints.forEach((endpoint) => {
2524
console.log(' URL:', endpoint.url)
2625
console.log(' Method:', endpoint.method)
2726
})
@@ -38,7 +37,8 @@ const isValid = await verifyGraphQLEndpoint('https://api.example.com/graphql')
3837

3938
if (isValid) {
4039
console.log('Valid GraphQL endpoint')
41-
} else {
40+
}
41+
else {
4242
console.log('Not a GraphQL endpoint')
4343
}
4444
```
@@ -54,7 +54,7 @@ const client = new GraphQLClient({
5454
endpoint: 'https://api.example.com/graphql',
5555
method: 'POST',
5656
headers: {
57-
'Authorization': 'Bearer your-token',
57+
Authorization: 'Bearer your-token',
5858
},
5959
})
6060

@@ -80,7 +80,8 @@ const response = await client.query({
8080

8181
if (response.data) {
8282
console.log('User:', response.data.user)
83-
} else if (response.errors) {
83+
}
84+
else if (response.errors) {
8485
console.error('GraphQL errors:', response.errors)
8586
}
8687
```
@@ -182,10 +183,10 @@ console.log('Mutation type:', schema?.mutationType?.name)
182183
console.log('Types:', schema?.types?.length)
183184

184185
// Explore available types
185-
schema?.types?.forEach(type => {
186+
schema?.types?.forEach((type) => {
186187
console.log(`Type: ${type.name}`)
187188
if (type.fields) {
188-
type.fields.forEach(field => {
189+
type.fields.forEach((field) => {
189190
console.log(` - ${field.name}: ${field.type.name}`)
190191
})
191192
}
@@ -207,7 +208,8 @@ const isHealthy = await client.healthCheck()
207208

208209
if (isHealthy) {
209210
console.log('GraphQL endpoint is healthy')
210-
} else {
211+
}
212+
else {
211213
console.error('GraphQL endpoint is down')
212214
}
213215
```
@@ -243,7 +245,7 @@ const jsCode = `
243245
const queries = extractGraphQLQueries(jsCode)
244246

245247
console.log('Found queries:', queries.length)
246-
queries.forEach(query => {
248+
queries.forEach((query) => {
247249
console.log(query)
248250
})
249251
```
@@ -253,8 +255,7 @@ queries.forEach(query => {
253255
Automatic detection and extraction:
254256

255257
```typescript
256-
import { createScraper } from 'ts-web-scraper'
257-
import { detectGraphQL, GraphQLClient } from 'ts-web-scraper'
258+
import { createScraper, detectGraphQL, GraphQLClient } from 'ts-web-scraper'
258259

259260
const scraper = createScraper()
260261

@@ -291,7 +292,8 @@ if (detection.hasGraphQL && detection.endpoints.length > 0) {
291292
})
292293

293294
console.log('Products:', response.data?.products)
294-
} catch (error) {
295+
}
296+
catch (error) {
295297
console.error('GraphQL query failed:', error)
296298
}
297299
}
@@ -336,7 +338,7 @@ import { GraphQLClient } from 'ts-web-scraper'
336338

337339
const client = new GraphQLClient({
338340
endpoint: 'https://api.example.com/graphql',
339-
retries: 3, // Retry on failure
341+
retries: 3, // Retry on failure
340342
})
341343

342344
const response = await client.query({
@@ -351,7 +353,7 @@ const response = await client.query({
351353

352354
if (response.errors) {
353355
console.error('GraphQL returned errors:')
354-
response.errors.forEach(error => {
356+
response.errors.forEach((error) => {
355357
console.error(' -', error.message)
356358
if (error.locations) {
357359
console.error(' at line', error.locations[0].line)
@@ -372,7 +374,7 @@ if (response.data) {
372374
Analyze query types:
373375

374376
```typescript
375-
import { getOperationType, getOperationName } from 'ts-web-scraper'
377+
import { getOperationName, getOperationType } from 'ts-web-scraper'
376378

377379
const query1 = `
378380
query GetUser($id: ID!) {
@@ -391,11 +393,11 @@ const query2 = `
391393
}
392394
`
393395

394-
console.log('Query 1 type:', getOperationType(query1)) // "query"
395-
console.log('Query 1 name:', getOperationName(query1)) // "GetUser"
396+
console.log('Query 1 type:', getOperationType(query1)) // "query"
397+
console.log('Query 1 name:', getOperationName(query1)) // "GetUser"
396398

397-
console.log('Query 2 type:', getOperationType(query2)) // "mutation"
398-
console.log('Query 2 name:', getOperationName(query2)) // "UpdateUser"
399+
console.log('Query 2 type:', getOperationType(query2)) // "mutation"
400+
console.log('Query 2 name:', getOperationName(query2)) // "UpdateUser"
399401
```
400402

401403
## GET Requests
@@ -407,7 +409,7 @@ import { GraphQLClient } from 'ts-web-scraper'
407409

408410
const client = new GraphQLClient({
409411
endpoint: 'https://api.example.com/graphql',
410-
method: 'GET', // Use GET instead of POST
412+
method: 'GET', // Use GET instead of POST
411413
})
412414

413415
// Query is encoded in URL parameters
@@ -423,12 +425,7 @@ console.log('Products:', response.data?.products)
423425
Full GraphQL scraping workflow:
424426

425427
```typescript
426-
import { createScraper } from 'ts-web-scraper'
427-
import {
428-
detectGraphQL,
429-
verifyGraphQLEndpoint,
430-
GraphQLClient,
431-
} from 'ts-web-scraper'
428+
import { createScraper, detectGraphQL, GraphQLClient, verifyGraphQLEndpoint } from 'ts-web-scraper'
432429

433430
async function scrapeGraphQLSite(url: string) {
434431
// 1. Scrape the page
@@ -489,12 +486,13 @@ async function scrapeGraphQLSite(url: string) {
489486
})
490487

491488
console.log('Available queries:')
492-
response.data?.__type?.fields?.forEach(field => {
489+
response.data?.__type?.fields?.forEach((field) => {
493490
console.log(` - ${field.name}: ${field.description}`)
494491
})
495492

496493
return { client, schema, endpoint: endpoint.url }
497-
} catch (error) {
494+
}
495+
catch (error) {
498496
console.error('Failed to query endpoint:', error)
499497
}
500498
}
@@ -530,8 +528,8 @@ import { GraphQLClient } from 'ts-web-scraper'
530528
const client = new GraphQLClient({
531529
endpoint: 'https://api.example.com/graphql',
532530
method: 'POST',
533-
timeout: 30000, // 30 second timeout
534-
retries: 3, // Retry up to 3 times
531+
timeout: 30000, // 30 second timeout
532+
retries: 3, // Retry up to 3 times
535533
headers: {
536534
'User-Agent': 'MyBot/1.0',
537535
},

0 commit comments

Comments
 (0)