@@ -33,7 +33,7 @@ class LGCrawler
33
33
34
34
/**
35
35
* Crawler constructor
36
- *
36
+ *
37
37
* @param [string] $sitename website name
38
38
* @param [string] $siteurl website url
39
39
* @param [resource] $SQLConn mysql connection
@@ -52,7 +52,7 @@ public function __construct($sitename, $siteurl, $SQLConn)
52
52
53
53
/**
54
54
* method to log messages
55
- *
55
+ *
56
56
* @param string $file the log file name
57
57
* @param string $text the log message
58
58
*/
@@ -126,7 +126,8 @@ private function runCrawler($url = null)
126
126
// store all links in `$url`s content
127
127
// for later crawling
128
128
$ this ->getLinks (
129
- $ url , function ($ pageURL ) use ($ fileContent , &$ links , &$ crawledPages , $ url ) {
129
+ $ url ,
130
+ function ($ pageURL ) use ($ fileContent , &$ links , &$ crawledPages , $ url ) {
130
131
131
132
// callback
132
133
if (isset ($ this ->onCrawlCallback [0 ])) {
@@ -141,8 +142,8 @@ private function runCrawler($url = null)
141
142
if ($ this ->isAlike ($ this ->siteurl , $ pageURL ) && !in_array ($ pageURL , $ crawledPages )) {
142
143
array_push ($ links , $ pageURL );
143
144
}
144
-
145
- }, $ fileContent /* page content provided so this method wont need to fetch the content again */
145
+ },
146
+ $ fileContent /* page content provided so this method wont need to fetch the content again */
146
147
);
147
148
148
149
// crawl all links in the `$links` array
@@ -155,10 +156,10 @@ private function runCrawler($url = null)
155
156
156
157
/**
157
158
* convert relative url to absolute url
158
- *
159
+ *
159
160
* @param [string] $rel relative url
160
161
* @param [string] $base base url
161
- *
162
+ *
162
163
* @return [string] absolute url
163
164
*/
164
165
private function rel2abs ($ rel , $ base )
@@ -258,10 +259,10 @@ private function getLinks($u, $callback, $content = '')
258
259
259
260
/**
260
261
* method to add pages to the db as we crawl
261
- *
262
+ *
262
263
* @param [string] $link page url
263
264
* @param [string] $content page content
264
- *
265
+ *
265
266
* @return [boolean] page added or not
266
267
*/
267
268
private function addPageToDatabase ($ link , $ content )
@@ -296,10 +297,10 @@ private function addPageToDatabase($link, $content)
296
297
$ dom = str_get_html ($ content );
297
298
298
299
// get <strong>, <b>, <em> tags from page
299
- $ pageEmphasis = $ this ->getPageElems ($ dom , $ content ,"strong,em,b " );
300
+ $ pageEmphasis = $ this ->getPageElems ($ dom , $ content , "strong,em,b " );
300
301
301
302
// get headers <h1>-<h6>
302
- $ pageHeaders = $ this ->getPageElems ($ dom , $ content ,"h1,h2,h3,h4,h5,h6 " );
303
+ $ pageHeaders = $ this ->getPageElems ($ dom , $ content , "h1,h2,h3,h4,h5,h6 " );
303
304
304
305
// strip out tags and remove useless html elements
305
306
$ content = $ this ->stripTags ($ content );
@@ -375,12 +376,13 @@ private function insertSiteInToDB()
375
376
* @param [DOMObject] $dom html node object from 'simple_html_dom' lib
376
377
* @param [Array[string]] $selectors selectors to be removed from dom
377
378
*/
378
- private function removeElem ($ dom , $ selectors ) {
379
+ private function removeElem ($ dom , $ selectors )
380
+ {
379
381
foreach ($ selectors as $ selector ) {
380
- $ elems = $ dom ->find ($ selector );
381
- foreach ($ elems as $ E ) {
382
- $ E ->innertext = "" ;
383
- }
382
+ $ elems = $ dom ->find ($ selector );
383
+ foreach ($ elems as $ E ) {
384
+ $ E ->innertext = "" ;
385
+ }
384
386
}
385
387
}
386
388
@@ -392,20 +394,21 @@ private function removeElem($dom, $selectors) {
392
394
*
393
395
* @return [string] string containing tag content
394
396
*/
395
- private function getPageElems ($ dom , $ content , $ tags ) {
397
+ private function getPageElems ($ dom , $ content , $ tags )
398
+ {
396
399
$ headers = $ dom ->find ($ tags );
397
400
$ str = "" ;
398
401
foreach ($ headers as $ h ) {
399
- $ str .= preg_replace ("#&[a-z0-9]+;#i " , "" , $ h ->plaintext ) . " " ;
402
+ $ str .= preg_replace ("#&[a-z0-9]+;#i " , "" , $ h ->plaintext ) . " " ;
400
403
}
401
404
return strip_tags ($ str );
402
405
}
403
406
404
407
/**
405
408
* strip out tags from html document
406
- *
409
+ *
407
410
* @param [string] $string HTML string
408
- *
411
+ *
409
412
* @return [string] HTML with tags stripped
410
413
*/
411
414
private function stripTags ($ string )
@@ -447,7 +450,7 @@ private function stripTags($string)
447
450
* check if string is html
448
451
*
449
452
* @param [string] $string string to check
450
- *
453
+ *
451
454
* @return [boolean] html or not
452
455
*/
453
456
private function isHTML ($ string )
@@ -458,10 +461,10 @@ private function isHTML($string)
458
461
/**
459
462
* check if urls are alike
460
463
* so as to prevent the crawler from exceeding its boundaries
461
- *
464
+ *
462
465
* @param [string] $url1 original url
463
466
* @param [string] $url2 test url
464
- *
467
+ *
465
468
* @return [boolean] alike or not
466
469
*/
467
470
private function isAlike ($ url , $ testUrl )
@@ -492,7 +495,7 @@ private function isAlike($url, $testUrl)
492
495
493
496
/**
494
497
* Takes a url and returns false (if its inaccessible) else it contents
495
- *
498
+ *
496
499
* @param [string] $url url to fetch
497
500
*/
498
501
private function getPageContent ($ url )
@@ -504,7 +507,7 @@ private function getPageContent($url)
504
507
* delete multiple whitespaces
505
508
*
506
509
* @param [string] $value string to trim
507
- *
510
+ *
508
511
* @return [string] trimmed string
509
512
*/
510
513
private function _trim ($ str )
0 commit comments