Please note that some of these scripts send automated queries to search engines. Take extra care before running those scripts in particular.
<html>
<head>
<title>Search Engine Web Page Viewer</title>
</head>
<body>
<form name="mainform" action="" method="get">
<table border="0" width="100%" align="center">
<tr>
<td>Enter URL: <br> <input type="text" name="url" size="20"></td>
</tr>
<tr>
<td> <input type="submit" value="Click to See Search Engine View" name="submit"> </td>
</tr>
</table>
</form>
<hr>
<?php
// Render a view only when the form was submitted with a plain string value.
// Reading $_GET['url'] unconditionally raises a notice on the first page load.
$myurl = (isset($_GET['url']) && is_string($_GET['url'])) ? $_GET['url'] : null;
if ($myurl !== null) {
    print spiderViewer($myurl);
}
?>
</body>
</html>
<?php
/**
 * Build an HTML fragment with the plain text left over after a page's markup,
 * entities, stop words and unusual punctuation have been removed.
 *
 * @param string $url Address of the page to fetch. A value without a scheme
 *                    is treated as http://; only http and https are fetched.
 * @return string HTML table with the reduced text, or a short error message
 *                when the URL is unusable or nothing could be downloaded.
 */
function spiderViewer($url)
{
    if (!is_string($url) || trim($url) === '') {
        return 'The url you entered was invalid.';
    }
    $url = trim($url);

    // Bare host names ("www.example.com") keep working as they did with cURL.
    if (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url)) {
        $url = 'http://' . $url;
    }
    // Refuse file://, ftp://, gopher:// ... so the form cannot be used to read
    // local files or reach other protocols through cURL.
    if (!preg_match('#^https?://[^/\s]+#i', $url)) {
        return 'The url you entered was invalid.';
    }

    $originalHTML = get_content($url);
    if (!$originalHTML) {
        return 'Please check your URL.';
    }

    // Elements whose content is never visible text. "#" is the delimiter
    // because the patterns contain "/" (closing tags).
    foreach (array('script', 'object', 'applet', 'style') as $tag) {
        $originalHTML = preg_replace(
            '#<' . $tag . '.*?>.*?</' . $tag . '.*?>#si',
            '',
            $originalHTML
        );
    }
    // Remaining tags, then character entities such as &nbsp; or &#160;.
    $originalHTML = preg_replace('#<.*?>#s', '', $originalHTML);
    $originalHTML = preg_replace('/&#?[a-z0-9]+;/i', ' ', $originalHTML);
    // Line breaks and tabs become spaces so that the character filter below
    // does not glue the last word of one line to the first word of the next.
    $originalHTML = preg_replace('/\s+/', ' ', $originalHTML);

    // Stop words: one per line, or separated by <br /> as the original file
    // format expected. Empty entries are skipped; an empty alternative would
    // match everywhere.
    $stopWords = array();
    if (is_readable('stopwords.txt')) {
        $entries = preg_split('#<br\s*/?>|\R#i', (string) file_get_contents('stopwords.txt'));
        foreach ($entries as $entry) {
            $entry = trim($entry);
            if ($entry !== '') {
                // Quote the word: it is data, not a pattern.
                $stopWords[] = preg_quote($entry, '/');
            }
        }
    }
    if ($stopWords) {
        // Replace with a space (not nothing) so neighbouring words stay apart.
        $originalHTML = preg_replace(
            '/\b(?:' . implode('|', $stopWords) . ')\b/i',
            ' ',
            $originalHTML
        );
    }

    // Keep letters, digits, basic punctuation and spaces only. The hyphen is
    // escaped; unescaped it formed an invalid ",- " range.
    $originalHTML = preg_replace('/[^A-Za-z0-9.?!;,\- ]+/', '', $originalHTML);
    $originalHTML = preg_replace('/ {2,}/', ' ', $originalHTML);

    $finalHTML  = '<table border="0" align="center" width="75%">';
    $finalHTML .= '<tr><td align="center" valign="top">';
    // The URL is user input and must be escaped before it is echoed back.
    $finalHTML .= '<b>Search Engine View for URL:' . htmlspecialchars($url, ENT_QUOTES) . '</b></td></tr>';
    $finalHTML .= '<tr><td align="left" valign="top">';
    $finalHTML .= $originalHTML . '</td></tr></table>';

    return $finalHTML;
}

/**
 * Download a URL with cURL, following redirects.
 *
 * @param string $url http or https address to fetch.
 * @return string Response body, or an empty string when the transfer failed.
 */
function get_content($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FAILONERROR, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)');
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    // Return the body instead of echoing it; replaces the output-buffer capture.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Redirects must not leave http/https either.
    if (defined('CURLOPT_PROTOCOLS')) {
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    }

    if (preg_match('#^https://#i', $url)) {
        // NOTE(review): certificate checks are switched off exactly as in the
        // original so that pages with broken certificates can still be viewed.
        // Re-enable them if the viewer is ever exposed beyond a trusted network.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    }

    $string = curl_exec($ch);
    curl_close($ch);

    return ($string === false) ? '' : $string;
}
<html>
<head>
<style>
/* Both panels are 800px wide and centred horizontally: left: 50% puts the
   left edge at the middle of the page and the negative margin of half the
   width pulls it back. The margins use the ASCII hyphen-minus; the typographic
   minus sign (U+2212) is not valid CSS and made browsers drop the rule. */
#navigation {
  position: absolute;
  top: 10px;
  left: 50%;
  width: 800px;
  margin-left: -400px;
  text-align: left;
}
#content {
  position: absolute;
  top: 150px;
  left: 50%;
  width: 800px;
  margin-left: -400px;
  text-align: left;
}
body {
  text-align: center;
  min-width: 600px;
}
</style>
</head>
<body>
<!-- The content comes first in the markup; the CSS above still places the
     navigation panel above it on screen. -->
<div id="content">content<!-- SEO optimized content text goes here.--></div>
<div id="navigation">navigation<!-- navigational elements, ads go here--></div>
</body>
</html>
<html>
<head>
<style>
/* Navigation column (200px) on the left, content column (600px) beside it.
   Each left offset minus the 400px negative margin gives the final position:
   navigation at 0px, content at 200px.
   Lengths carry an explicit unit (a bare "400" is not a valid length) and the
   margins use the ASCII hyphen-minus instead of the typographic minus sign
   (U+2212), which is not valid CSS. */
#navigation {
  position: absolute;
  top: 0px;
  left: 400px;
  width: 200px;
  margin-left: -400px;
  text-align: left;
}
#content {
  position: absolute;
  top: 0px;
  left: 600px;
  width: 600px;
  margin-left: -400px;
  text-align: left;
}
body {
  text-align: center;
  min-width: 800px;
}
</style>
</head>
<body>
<!-- The content comes first in the markup; positioning is done purely in CSS. -->
<div id="content"> SEO optimized content text goes here.</div>
<div id="navigation">navigational elements, ads go here</div>
</body>
</html>
<html>
<head>
<style>
/* An 800px wide page centred with left: 50% and negative margins:
     top   spans the full 800px            (margin -400px)
     left  is the first 200px              (margin -400px)
     main  is the middle 600px             (margin -200px)
     right starts at the horizontal centre (margin 0)
   The margins use the ASCII hyphen-minus; the typographic minus sign (U+2212)
   is not valid CSS and made browsers drop the declarations. */
#top {
  position: absolute;
  top: 10px;
  left: 50%;
  width: 800px;
  margin-left: -400px;
  text-align: left;
}
#left {
  position: absolute;
  top: 150px;
  left: 50%;
  width: 200px;
  margin-left: -400px;
  text-align: left;
}
#main {
  position: absolute;
  top: 150px;
  left: 50%;
  width: 600px;
  margin-left: -200px;
  text-align: left;
}
#right {
  position: absolute;
  top: 150px;
  left: 50%;
  width: 200px;
  margin-left: 0px;
  text-align: left;
}
body {
  text-align: center;
  min-width: 800px;
}
</style>
</head>
<body>
<!-- The main body comes first in the markup; the panels are placed by CSS. -->
<div id="main">optimized main body</div>
<div id="left">left panel</div>
<div id="top">top panel</div>
<div id="right">right panel</div>
</body>
</html>
#!/usr/local/bin/perl
###########################################################
# File: rankingfactors.pl                                 #
# Description: This script performs analysis on several   #
#              ranking factors including:                 #
#               1) Keywords in Page Titles                #
#               2) Keywords in Domain Names               #
#               3) Keywords in Page Copy                  #
#               4) Keywords in Headings                   #
#               5) Keywords in Meta description           #
#               6) Keyword Proximity                      #
#               7) Keywords in Outbound Links             #
#               8) Page Size                              #
#               9) Words per Page                         #
#              10) Website Size                           #
#              and more...                                #
#                                                         #
# Format: perl rankingfactors.pl 10|100 keyword(s)        #
###########################################################
use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser;
use HTML::TreeBuilder;
use File::Path;
use Math::Round qw(:all);

# TODO: enable "use strict; use warnings;" once every subroutine in this file
# has been checked for undeclared variables; it is left off here so this
# section does not change how the rest of the file compiles.

# The analysis subroutines fill the result arrays declared below in place,
# so every array is handed over as a reference (\@array). Passing the arrays
# directly would flatten them into the argument list.

die "Usage: perl rankingfactors.pl 10|100 keyword(s)\n" if @ARGV < 2;

my @googleLinks  = ( );
my @googleTitles = ( );
my @yahooLinks   = ( );
my @yahooTitles  = ( );
my @bingLinks    = ( );
my @bingTitles   = ( );

#build keyphrase/keyword from every argument after the result count
my $numres    = $ARGV[0];
my $keyphrase = join ' ', @ARGV[1 .. $#ARGV];
$keyphrase =~ s/^\s+//;    # trim leading whitespace
$keyphrase =~ s/\s+$//;    # trim trailing whitespace
$keyphrase =~ s/'//g;
$keyphrase =~ s/"//g;

print " Starting..";

#cleanup temp files
rmtree( './serptemp', {keep_root => 1} );
print " ..cleanup done";

#initialize variables
initializeKeyVariables($keyphrase,
                       \@googleLinks, \@googleTitles,
                       \@yahooLinks,  \@yahooTitles,
                       \@bingLinks,   \@bingTitles);

#let's store all destination links found on SERPs
print " ..getting SERPs";
getSERPResults($#googleLinks, \@googleLinks, "google");
getSERPResults($#yahooLinks,  \@yahooLinks,  "yahoo");
getSERPResults($#bingLinks,   \@bingLinks,   "bing");
print " ..got the SERPs";

#-------------------TITLE Analysis-----------------------
#get real titles
my @googleRealTitles = ( );
my @yahooRealTitles  = ( );
my @bingRealTitles   = ( );
getRealTitles($#googleLinks, \@googleRealTitles, "google");
getRealTitles($#yahooLinks,  \@yahooRealTitles,  "yahoo");
getRealTitles($#bingLinks,   \@bingRealTitles,   "bing");
print " ..got the real titles";

#compare real titles with titles on SERPs
my @googleTitleComp = ( );
my @yahooTitleComp  = ( );
my @bingTitleComp   = ( );
my $percentMatchTitlesGoogle =
    compareArrays($#googleTitles, \@googleRealTitles, \@googleTitles, \@googleTitleComp);
my $percentMatchTitlesYahoo =
    compareArrays($#yahooTitles, \@yahooRealTitles, \@yahooTitles, \@yahooTitleComp);
my $percentMatchTitlesBing =
    compareArrays($#bingTitles, \@bingRealTitles, \@bingTitles, \@bingTitleComp);
print " ..finished partial title comparisons";

#find keyword title matches
my @googleKeywordTitleMatch = ( );
my @yahooKeywordTitleMatch  = ( );
my @bingKeywordTitleMatch   = ( );
getKeywordsTitleMatch($keyphrase, \@googleRealTitles, $#googleRealTitles, \@googleKeywordTitleMatch);
getKeywordsTitleMatch($keyphrase, \@yahooRealTitles,  $#yahooRealTitles,  \@yahooKeywordTitleMatch);
getKeywordsTitleMatch($keyphrase, \@bingRealTitles,   $#bingRealTitles,   \@bingKeywordTitleMatch);
print " ..finished keyword title comparisons";

#find if keyword in title found in page copy
my @googleKeywordTitlePageCopy = ( );
my @yahooKeywordTitlePageCopy  = ( );
my @bingKeywordTitlePageCopy   = ( );
compareTitlePageCopy($#googleRealTitles, \@googleRealTitles, \@googleKeywordTitlePageCopy, "google");
compareTitlePageCopy($#yahooRealTitles,  \@yahooRealTitles,  \@yahooKeywordTitlePageCopy,  "yahoo");
compareTitlePageCopy($#bingRealTitles,   \@bingRealTitles,   \@bingKeywordTitlePageCopy,   "bing");
print " ..finished title page copy comparisons";

#-------------------Domain Name Analysis-----------------------
#exact match
my @googleDomainKeywordExactMatch = ( );
my @yahooDomainKeywordExactMatch  = ( );
my @bingDomainKeywordExactMatch   = ( );
my $percentDomainKeywordExactMatchGoogle =
    keywordDomainExactMatch($keyphrase, \@googleLinks, $#googleLinks, \@googleDomainKeywordExactMatch);
my $percentDomainKeywordExactMatchYahoo =
    keywordDomainExactMatch($keyphrase, \@yahooLinks, $#yahooLinks, \@yahooDomainKeywordExactMatch);
my $percentDomainKeywordExactMatchBing =
    keywordDomainExactMatch($keyphrase, \@bingLinks, $#bingLinks, \@bingDomainKeywordExactMatch);
print " ..finished domain name exact keyword analysis";

#partial match
my @googleDomainKeywordPartialMatch = ( );
my @yahooDomainKeywordPartialMatch  = ( );
my @bingDomainKeywordPartialMatch   = ( );
# Declared with "my" like the exact-match results above (they used to be
# undeclared package globals).
my $percentDomainKeywordPartialMatchGoogle =
    keywordDomainPartialMatch($keyphrase, \@googleLinks, $#googleLinks, \@googleDomainKeywordPartialMatch);
my $percentDomainKeywordPartialMatchYahoo =
    keywordDomainPartialMatch($keyphrase, \@yahooLinks, $#yahooLinks, \@yahooDomainKeywordPartialMatch);
my $percentDomainKeywordPartialMatchBing =
    keywordDomainPartialMatch($keyphrase, \@bingLinks, $#bingLinks, \@bingDomainKeywordPartialMatch);
print " ..finished domain name partial keyword analysis";

#-------------------Page Copy Analysis----------------------------
my @googleKeywordDensity = ( );
my @yahooKeywordDensity  = ( );
my @bingKeywordDensity   = ( );
my $googleAvgDensity = keywordDensity($#googleLinks, $keyphrase, \@googleKeywordDensity, "google");
my $yahooAvgDensity  = keywordDensity($#yahooLinks,  $keyphrase, \@yahooKeywordDensity,  "yahoo");
my $bingAvgDensity   = keywordDensity($#bingLinks,   $keyphrase, \@bingKeywordDensity,   "bing");

#-------------------Description META Tag Analysis------------------
my @googleDescriptionMetaExact = ( );
my @yahooDescriptionMetaExact  = ( );
my @bingDescriptionMetaExact   = ( );
checkExactDescriptionMeta($#googleLinks, \@googleDescriptionMetaExact, $keyphrase, "google");
checkExactDescriptionMeta($#yahooLinks,  \@yahooDescriptionMetaExact,  $keyphrase, "yahoo");
checkExactDescriptionMeta($#bingLinks,   \@bingDescriptionMetaExact,   $keyphrase, "bing");

my @googleDescriptionMetaPartial = ( );
my @yahooDescriptionMetaPartial  = ( );
my @bingDescriptionMetaPartial   = ( );
checkPartialDescriptionMeta($#googleLinks, \@googleDescriptionMetaPartial, $keyphrase, "google");
checkPartialDescriptionMeta($#yahooLinks,  \@yahooDescriptionMetaPartial,  $keyphrase, "yahoo");
checkPartialDescriptionMeta($#bingLinks,   \@bingDescriptionMetaPartial,   $keyphrase, "bing");
print " ..finished description META analysis";

#-------------------Header Tag Analysis----------------------------
my @googleNumberOfHeaderTags = ( );
my @yahooNumberOfHeaderTags  = ( );
my @bingNumberOfHeaderTags   = ( );
my @googleHeaderTagsKeywords = ( );
my @yahooHeaderTagsKeywords  = ( );
my @bingHeaderTagsKeywords   = ( );
checkHeaderTags($#googleLinks, \@googleNumberOfHeaderTags, \@googleHeaderTagsKeywords, "google", $keyphrase);
checkHeaderTags($#yahooLinks,  \@yahooNumberOfHeaderTags,  \@yahooHeaderTagsKeywords,  "yahoo",  $keyphrase);
checkHeaderTags($#bingLinks,   \@bingNumberOfHeaderTags,   \@bingHeaderTagsKeywords,   "bing",   $keyphrase);
print " ..finished header tags analysis";

#-------------------Keyword Proximity Analysis---------------------
my @googleKeywordPositions     = ( );
my @yahooKeywordPositions      = ( );
my @bingKeywordPositions       = ( );
my @googleKeywordPositionsList = ( );
my @yahooKeywordPositionsList  = ( );
my @bingKeywordPositionsList   = ( );
analyzeKeywordPositions($#googleLinks, \@googleKeywordPositions, \@googleKeywordPositionsList, "google", $keyphrase);
analyzeKeywordPositions($#yahooLinks,  \@yahooKeywordPositions,  \@yahooKeywordPositionsList,  "yahoo",  $keyphrase);
analyzeKeywordPositions($#bingLinks,   \@bingKeywordPositions,   \@bingKeywordPositionsList,   "bing",   $keyphrase);
print " ..finished keyword proximity analysis";

#-------------------Outbound Link Analysis--------------------------
# The Yahoo arrays here and in the PR section were declared with a stray "K"
# (@yahooKOutbound...), while the Yahoo report reads @yahooOutboundLinkKeywords
# and @yahooOutboundLinksPR; the report columns therefore stayed empty.
my @googleOutboundLinkKeywords = ( );
my @yahooOutboundLinkKeywords  = ( );
my @bingOutboundLinkKeywords   = ( );
outboundLinkKeywordAnalysis($#googleLinks, \@googleLinks, \@googleOutboundLinkKeywords, "google", $keyphrase);
outboundLinkKeywordAnalysis($#yahooLinks,  \@yahooLinks,  \@yahooOutboundLinkKeywords,  "yahoo",  $keyphrase);
outboundLinkKeywordAnalysis($#bingLinks,   \@bingLinks,   \@bingOutboundLinkKeywords,   "bing",   $keyphrase);
print " ..finished outbound links analysis";

#-------------------Outbound Link PR Analysis--------------------------
my @googleOutboundLinksPR = ( );
my @yahooOutboundLinksPR  = ( );
my @bingOutboundLinksPR   = ( );
outboundLinkPRAnalysis($#googleLinks, \@googleLinks, \@googleOutboundLinksPR, "google", $keyphrase);
outboundLinkPRAnalysis($#yahooLinks,  \@yahooLinks,  \@yahooOutboundLinksPR,  "yahoo",  $keyphrase);
outboundLinkPRAnalysis($#bingLinks,   \@bingLinks,   \@bingOutboundLinksPR,   "bing",   $keyphrase);
print " ..finished outbound link PR analysis";

#-------------------Average Page Size Analysis--------------------------
my @googlePageSize = ( );
my @yahooPageSize  = ( );
my @bingPageSize   = ( );
my $googleAvgPageSize = averagePageSize($#googleLinks, \@googlePageSize, "google");
my $yahooAvgPageSize  = averagePageSize($#yahooLinks,  \@yahooPageSize,  "yahoo");
my $bingAvgPageSize   = averagePageSize($#bingLinks,   \@bingPageSize,   "bing");
print " ..finished average page size analysis";

#-------------------Optimum Number of Words Analysis--------------------
my @googleWords = ( );
my @yahooWords  = ( );
my @bingWords   = ( );
my $googleWordsPerPage = optimumWordsPerPage($#googleLinks, \@googleWords, "google");
my $yahooWordsPerPage  = optimumWordsPerPage($#yahooLinks,  \@yahooWords,  "yahoo");
my $bingWordsPerPage   = optimumWordsPerPage($#bingLinks,   \@bingWords,   "bing");
print " ..finished optimum number of words analysis";

#-------------------Website Size Analysis-------------------------------
my @googleResultsWebsiteSizes = ( );
my @yahooResultsWebsiteSizes  = ( );
my @bingResultsWebsiteSizes   = ( );
my $googleAverageWebSize = analyzeWebsiteSize($#googleLinks, \@googleLinks, \@googleResultsWebsiteSizes);
my $yahooAverageWebSize  = analyzeWebsiteSize($#yahooLinks,  \@yahooLinks,  \@yahooResultsWebsiteSizes);
my $bingAverageWebSize   = analyzeWebsiteSize($#bingLinks,   \@bingLinks,   \@bingResultsWebsiteSizes);
print " ..finished website size analysis";

#-------------------Page Age Analysis-----------------------------------
my @googlePageAge = ( );
my @yahooPageAge  = ( );
my @bingPageAge   = ( );
pageAgeAnalysis($#googleLinks, \@googleLinks, \@googlePageAge);
pageAgeAnalysis($#yahooLinks, \@yahooLinks, \@yahooPageAge);
pageAgeAnalysis($#bingLinks,  \@bingLinks,  \@bingPageAge);

#-------------------Create HTML Report---------------------------------
#create index file
createIndexHTML($keyphrase);

# NOTE(review): $#array is the LAST INDEX, one less than the element count.
# The report loops below therefore stop one row short of the array length,
# which matches the count handed to the analysis subroutines -- confirm that
# skipping the final result is intended.
my $numberOfLinesGoogle = $#googleLinks;
my $numberOfLinesYahoo  = $#yahooLinks;
my $numberOfLinesBing   = $#bingLinks;

createGoogleHTMLReport();
createYahooHTMLReport();
createBingHTMLReport();

#---------------------------SUBROUTINES---------------------------

# Subroutine:
#   createGoogleHTMLReport
# Description:
#   This subroutine creates google.html file
#   which summarizes Google SERP findings
# Inputs:
#   None (reads the file-level Google result variables)
# Outputs:
#   Creates ./report/google.html and one prominence map per row
# Returns:
#   Returns nothing
sub createGoogleHTMLReport {
    _writeEngineHTMLReport('Google', {
        numberOfLines                    => $numberOfLinesGoogle,
        percentMatchTitles               => $percentMatchTitlesGoogle,
        percentDomainKeywordExactMatch   => $percentDomainKeywordExactMatchGoogle,
        percentDomainKeywordPartialMatch => $percentDomainKeywordPartialMatchGoogle,
        avgDensity                       => $googleAvgDensity,
        avgPageSize                      => $googleAvgPageSize,
        wordsPerPage                     => $googleWordsPerPage,
        averageWebSize                   => $googleAverageWebSize,
        links                            => \@googleLinks,
        titles                           => \@googleTitles,
        realTitles                       => \@googleRealTitles,
        keywordTitleMatch                => \@googleKeywordTitleMatch,
        keywordTitlePageCopy             => \@googleKeywordTitlePageCopy,
        domainKeywordExactMatch          => \@googleDomainKeywordExactMatch,
        domainKeywordPartialMatch        => \@googleDomainKeywordPartialMatch,
        keywordDensity                   => \@googleKeywordDensity,
        descriptionMetaExact             => \@googleDescriptionMetaExact,
        descriptionMetaPartial           => \@googleDescriptionMetaPartial,
        numberOfHeaderTags               => \@googleNumberOfHeaderTags,
        headerTagsKeywords               => \@googleHeaderTagsKeywords,
        keywordPositions                 => \@googleKeywordPositions,
        keywordPositionsList             => \@googleKeywordPositionsList,
        outboundLinkKeywords             => \@googleOutboundLinkKeywords,
        outboundLinksPR                  => \@googleOutboundLinksPR,
        pageSize                         => \@googlePageSize,
        words                            => \@googleWords,
        resultsWebsiteSizes              => \@googleResultsWebsiteSizes,
        pageAge                          => \@googlePageAge,
    });
    return;
}

# Subroutine:
#   createYahooHTMLReport
# Description:
#   This subroutine creates yahoo.html file
#   which summarizes Yahoo SERP findings
# Inputs:
#   None (reads the file-level Yahoo result variables)
# Outputs:
#   Creates ./report/yahoo.html and one prominence map per row
# Returns:
#   Returns nothing
sub createYahooHTMLReport {
    _writeEngineHTMLReport('Yahoo', {
        numberOfLines                    => $numberOfLinesYahoo,
        percentMatchTitles               => $percentMatchTitlesYahoo,
        percentDomainKeywordExactMatch   => $percentDomainKeywordExactMatchYahoo,
        percentDomainKeywordPartialMatch => $percentDomainKeywordPartialMatchYahoo,
        avgDensity                       => $yahooAvgDensity,
        avgPageSize                      => $yahooAvgPageSize,
        wordsPerPage                     => $yahooWordsPerPage,
        averageWebSize                   => $yahooAverageWebSize,
        links                            => \@yahooLinks,
        titles                           => \@yahooTitles,
        realTitles                       => \@yahooRealTitles,
        keywordTitleMatch                => \@yahooKeywordTitleMatch,
        keywordTitlePageCopy             => \@yahooKeywordTitlePageCopy,
        domainKeywordExactMatch          => \@yahooDomainKeywordExactMatch,
        domainKeywordPartialMatch        => \@yahooDomainKeywordPartialMatch,
        keywordDensity                   => \@yahooKeywordDensity,
        descriptionMetaExact             => \@yahooDescriptionMetaExact,
        descriptionMetaPartial           => \@yahooDescriptionMetaPartial,
        numberOfHeaderTags               => \@yahooNumberOfHeaderTags,
        headerTagsKeywords               => \@yahooHeaderTagsKeywords,
        keywordPositions                 => \@yahooKeywordPositions,
        keywordPositionsList             => \@yahooKeywordPositionsList,
        outboundLinkKeywords             => \@yahooOutboundLinkKeywords,
        outboundLinksPR                  => \@yahooOutboundLinksPR,
        pageSize                         => \@yahooPageSize,
        words                            => \@yahooWords,
        resultsWebsiteSizes              => \@yahooResultsWebsiteSizes,
        pageAge                          => \@yahooPageAge,
    });
    return;
}

# Subroutine:
#   createBingHTMLReport
# Description:
#   This subroutine creates bing.html file
#   which summarizes Bing SERP findings
# Inputs:
#   None (reads the file-level Bing result variables)
# Outputs:
#   Creates ./report/bing.html and one prominence map per row
# Returns:
#   Returns nothing
sub createBingHTMLReport {
    _writeEngineHTMLReport('Bing', {
        numberOfLines                    => $numberOfLinesBing,
        percentMatchTitles               => $percentMatchTitlesBing,
        percentDomainKeywordExactMatch   => $percentDomainKeywordExactMatchBing,
        percentDomainKeywordPartialMatch => $percentDomainKeywordPartialMatchBing,
        avgDensity                       => $bingAvgDensity,
        avgPageSize                      => $bingAvgPageSize,
        wordsPerPage                     => $bingWordsPerPage,
        averageWebSize                   => $bingAverageWebSize,
        links                            => \@bingLinks,
        titles                           => \@bingTitles,
        realTitles                       => \@bingRealTitles,
        keywordTitleMatch                => \@bingKeywordTitleMatch,
        keywordTitlePageCopy             => \@bingKeywordTitlePageCopy,
        domainKeywordExactMatch          => \@bingDomainKeywordExactMatch,
        domainKeywordPartialMatch        => \@bingDomainKeywordPartialMatch,
        keywordDensity                   => \@bingKeywordDensity,
        descriptionMetaExact             => \@bingDescriptionMetaExact,
        descriptionMetaPartial           => \@bingDescriptionMetaPartial,
        numberOfHeaderTags               => \@bingNumberOfHeaderTags,
        headerTagsKeywords               => \@bingHeaderTagsKeywords,
        keywordPositions                 => \@bingKeywordPositions,
        keywordPositionsList             => \@bingKeywordPositionsList,
        outboundLinkKeywords             => \@bingOutboundLinkKeywords,
        outboundLinksPR                  => \@bingOutboundLinksPR,
        pageSize                         => \@bingPageSize,
        words                            => \@bingWords,
        resultsWebsiteSizes              => \@bingResultsWebsiteSizes,
        pageAge                          => \@bingPageAge,
    });
    return;
}

# Subroutine:
#   _writeEngineHTMLReport (private helper of the three create*HTMLReport subs)
# Description:
#   Builds the summary table and the per-result detail table for one
#   search engine and writes them to ./report/<engine>.html
# Inputs:
#   $engine => display name ("Google", "Yahoo", "Bing"); its lower-case
#              form is the file prefix used for the report and map files
#   $report => hash (reference) with the averages (scalars) and the
#              per-result columns (array references)
# Outputs:
#   Creates ./report/<engine>.html; calls printIndividualKeywordProminenceMap
#   once per detail row
# Returns:
#   Returns nothing
sub _writeEngineHTMLReport {
    my ($engine, $report) = @_;
    my $prefix = lc $engine;

    #create summary table first
    my $html = "<html><head><title>Detailed Summary for $engine</title>";
    $html .= '<style>';
    # Single-quoted: these rules contain double quotes.
    $html .= 'body, td, tr{font-family: "Trebuchet ms", verdana, sans-serif; font-size:9px;}';
    $html .= 'b{font-family: "Trebuchet ms", verdana, sans-serif;font-size:10px;}';
    $html .= '</style>';
    $html .= '</head>';
    $html .= '<body><h1>Ranking Report Summary</h1>';
    $html .= '<br>';
    $html .= '<table border="1" width="500" cellspacing="2" cellpadding="2">';
    $html .= '<tr><td colspan=2><b>Averages</b></td>';
    $html .= '</tr>';

    my @summaryRows = (
        [ '% Title Match',                  sprintf('%.1f', $report->{percentMatchTitles}) ],
        [ '% Keyword Domain Exact Match',   sprintf('%.1f', $report->{percentDomainKeywordExactMatch}) ],
        [ '% Keyword Domain Partial Match', sprintf('%.1f', $report->{percentDomainKeywordPartialMatch}) ],
        [ '% Keyword Density',              sprintf('%.1f', $report->{avgDensity}) ],
        [ 'Page Size [bytes]',              sprintf('%.0f', $report->{avgPageSize}) ],
        [ 'Words Per Page',                 sprintf('%.0f', $report->{wordsPerPage}) ],
        [ 'Website Size [of base url]',     round($report->{averageWebSize}) ],
    );
    foreach my $row (@summaryRows) {
        $html .= "<tr><td><b>$row->[0]</b></td><td>$row->[1]</td></tr>";
    }
    $html .= '</table><br><br>';

    #detail table header
    $html .= '<b>Detail Table</b> <br>';
    $html .= '<table border=1 cellpadding=2 cellspacing=2>';
    $html .= '<tr>';
    $html .= '<td nowrap>#</td>';
    $html .= "<td width='100'><b>URL</b></td>";
    $html .= "<td nowrap width='150'><b>$engine Title</b></td>";
    $html .= "<td nowrap width='150'><b>Page Title</b></td>";
    $html .= '<td nowrap><b>Keyword(s) found<br> in Title? [Y|N]</b></td>';
    $html .= '<td nowrap><b>Title Keywords <br>In Page Copy [%]</b></td>';
    $html .= '<td nowrap><b>Domain name <br>Exact Match</b></td>';
    $html .= '<td nowrap><b>Domain name <br>Partial Match</b></td>';
    $html .= '<td nowrap><b>Keyword Density</b></td>';
    $html .= '<td nowrap><b>META Description<br> Exact Match</b></td>';
    $html .= '<td nowrap><b>META Description<br> Partial Match</b></td>';
    $html .= '<td nowrap><b>Header Tags</b></td>';
    $html .= '<td nowrap><b>Header Tag <br>Keywords</b></td>';
    $html .= "<td nowrap width='350'><b>Keyword Positions in Page</b></td>";
    $html .= '<td nowrap><b>Keyword Prominence Map</b></td>';
    $html .= '<td nowrap><b>Outbound Links with Keywords</b></td>';
    $html .= "<td nowrap width='150'><b>Outbound Link<br> PRs</b></td>";
    $html .= '<td nowrap><b>Page Size <br>[bytes]</b></td>';
    $html .= '<td nowrap><b>Words in<br> Page</b></td>';
    $html .= '<td nowrap><b>Website Size</b></td>';
    $html .= '<td nowrap><b>Page Age</b></td>';
    $html .= '</tr>';

    #one detail row per result
    for (my $i = 0; $i < $report->{numberOfLines}; $i++) {
        # Positions are stored "|"-separated; the pipe must be escaped in the
        # pattern, otherwise it matches the empty string between all characters.
        my $positions = $report->{keywordPositionsList}[$i];
        $positions = '' unless defined $positions;
        $positions =~ s/\|/, /g;

        my @cellsBeforeMap = (
            $i,
            $report->{links}[$i],
            $report->{titles}[$i],
            $report->{realTitles}[$i],
            $report->{keywordTitleMatch}[$i],
            sprintf('%.1f', $report->{keywordTitlePageCopy}[$i]),
            $report->{domainKeywordExactMatch}[$i],
            $report->{domainKeywordPartialMatch}[$i],
            sprintf('%.3f', $report->{keywordDensity}[$i]),
            $report->{descriptionMetaExact}[$i],
            $report->{descriptionMetaPartial}[$i],
            $report->{numberOfHeaderTags}[$i],
            $report->{headerTagsKeywords}[$i],
            $positions,
        );
        my @cellsAfterMap = (
            $report->{outboundLinkKeywords}[$i],
            $report->{outboundLinksPR}[$i],
            $report->{pageSize}[$i],
            $report->{words}[$i],
            $report->{resultsWebsiteSizes}[$i],
            $report->{pageAge}[$i],
        );

        $html .= '<tr>';
        foreach my $cell (@cellsBeforeMap) {
            $html .= "<td align=left>$cell </td>";
        }
        $html .= "<td align=left><a href='./maps/$prefix$i.html'>Map</a></td>";
        # Writes the map page the link above points to.
        printIndividualKeywordProminenceMap($i, $report->{keywordPositions}, $prefix);
        foreach my $cell (@cellsAfterMap) {
            $html .= "<td align=left>$cell </td>";
        }
        $html .= '</tr>';
    }
    # The original never closed the detail table or the document.
    $html .= '</table></body></html>';

    my $filename = "./report/$prefix.html";
    open my $fh, '>', $filename or die "Cannot write $filename: $!";
    print {$fh} $html;
    close $fh or die "Cannot close $filename: $!";
    return;
}

# Subroutine:
#   createIndexHTML
# Description:
#   This subroutine creates the index page that links to the three
#   search engine reports and shows the selected one in an iframe
# Inputs:
#   $keyword => keyword
# Outputs:
#   Creates ./report/index.html
# Returns:
#   Returns nothing
sub createIndexHTML {
    my $keyword = shift;

    # q{}/qq{} quoting is used because the markup contains both quote styles.
    my $indexFile = "<html><head><title>Ranking Report Summary</title></head>";
    $indexFile .= "<body><center><strong>Ranking Report Summary";
    $indexFile .= qq{ (for "$keyword") <br><br>};
    $indexFile .= q{<a href="#" onclick="document.all.myiframe.src='google.html'">};
    $indexFile .= "Google</a> |";
    $indexFile .= q{<a href="#" onclick="document.all.myiframe.src='yahoo.html'">};
    $indexFile .= "Yahoo!</a> |";
    $indexFile .= q{<a href="#" onclick="document.all.myiframe.src='bing.html'">};
    $indexFile .= "Bing Search</a><br><br>";
    $indexFile .= "Click on Links to View Summary..<br><br>";
    $indexFile .= q{<iframe name="myiframe" width=5000 height=6000 border="0" frameborder="0">};
    $indexFile .= "</iframe></center></body></html>";

    my $filename = "./report/index.html";
    open my $fh, '>', $filename or die "Cannot write $filename: $!";
    print {$fh} $indexFile;
    close $fh or die "Cannot close $filename: $!";
    return;
}

# Subroutine:
#   pageAgeAnalysis
# Description:
#   This subroutine scrapes all URLs found on SERPs
#   looking for last modified string
# Inputs:
#   $numberOfElements => number of URLs to process
#   $srcArr => array (reference) of result URLs to request
#   $destArr => array (reference) receiving one last-modified string per URL
# Outputs:
#   none
# Returns:
#   Returns nothing
sub pageAgeAnalysis {
    my ($numberOfElements, $srcArr, $destArr) = @_;
    return if !$numberOfElements || $numberOfElements <= 0;

    # One user agent serves every request (the original rebuilt it per URL).
    my $ua = LWP::UserAgent->new;
    $ua->agent("Mozilla/3.0 (compatible)");

    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $request  = HTTP::Request->new("GET", "$srcArr->[$i]");
        my $response = $ua->request($request);

        # Without a Last-Modified header last_modified() is undefined, and
        # localtime(undef) would report the 1970 epoch as the page age.
        my $lastModified = $response->last_modified;
        $destArr->[$i] = defined $lastModified
                       ? scalar(localtime($lastModified))
                       : 'unknown';
    }
    return;
}

# Subroutine:
#   analyzeWebsiteSize
# Description:
#   This subroutine scrapes Google SERPs to pick up size of
#   different websites
# Inputs:
#   $numberOfElements => number of URLs to process
#   $srcArr => array (reference) of result URLs
#   $destArr => array (reference) receiving the indexed page count
#               per URL (undef when no count could be found)
# Outputs:
#   Prints a notice when a SERP could not be downloaded
# Returns:
#   Returns average site size (0 when nothing could be measured)
sub analyzeWebsiteSize {
    my ($numberOfElements, $srcArr, $destArr) = @_;
    return 0 if !$numberOfElements || $numberOfElements <= 0;

    my $ua = LWP::UserAgent->new;
    $ua->timeout(25);
    $ua->agent("Mozilla/3.0 (compatible)");

    my $filename = "./serptemp/temp.txt";
    my $total    = 0;

    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $url = $srcArr->[$i];

        #let's get the base URL (host part) first
        if ($url =~ m{^https?://([^/]+)}i) {
            $url = $1;
        }

        # compose "site:" link
        my $tmpurl = 'http://www.google.com/search?hl=en&q=site%3A' . $url . '&btnG=Search';

        # random pause so the queries are not fired back to back
        sleep(int(rand(5)));

        # Remove the previous download first; otherwise a failed request would
        # silently reuse the page fetched for the previous URL.
        unlink $filename;
        $ua->get($tmpurl, ':content_file' => $filename);

        #get the google SERP pagecopy variable
        my $pageCopy = "";
        if (-e $filename) {
            my $p = HTML::TokeParser->new($filename);
            while ($p && $p->get_tag("body")) {
                $pageCopy = $p->get_text("/body");
            }
        }
        else {
            print " file does not exist";
        }

        # The count follows the words "of about" in the result text.
        # NOTE(review): this depends on the wording of the result page --
        # confirm it still matches what the search engine returns.
        my $size;
        if ($pageCopy =~ /of about\s+([\d,]+)/) {
            $size = $1;
            $size =~ s/,//g;    #remove comma in the number
        }

        # store it for that URL
        $destArr->[$i] = $size;
        $total += $size if defined $size;
    }

    #calculate and return the average
    return $total > 0 ? $total / $numberOfElements : 0;
}

# Subroutine:
#   optimumWordsPerPage
# Description:
#   This subroutine loops through all files to record
#   the number of words per page in destination array.
# Inputs:
#   $numberOfElements => number of files to process
#   $destArr => array (reference) receiving the word count per file
#   $prefix => SE file prefix
# Outputs:
#   none
# Returns:
#   Returns average words per page (0 when there are no files)
sub optimumWordsPerPage {
    my ($numberOfElements, $destArr, $prefix) = @_;

    # Guard against dividing by zero when no results were collected.
    return 0 if !$numberOfElements || $numberOfElements <= 0;

    my $total = 0;
    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename = './serptemp/' . $prefix . "$i.txt";

        my $tree = HTML::TreeBuilder->new;
        $tree->parse_file($filename);
        my $non_html = $tree->as_text();
        $tree->delete;    # release the parse tree before the next file

        # split ' ' splits on runs of whitespace and ignores leading blanks;
        # the element count (not the last index) is the number of words.
        my @words = split ' ', $non_html;
        $destArr->[$i] = scalar @words;
        $total += scalar @words;
    }
    return $total / $numberOfElements;
}

# Subroutine:
#   averagePageSize
# Description:
#   This subroutine loops through all files to record
#   page sizes in destination array.
# Inputs:
#   $numberOfElements => number of files to process
#   $destArr => array (reference) receiving the size of each file
#   $prefix  => SE file prefix
# Outputs:
#   none
# Returns:
#   Returns average page size (0 when there are no files)
sub averagePageSize {
    my ($numberOfElements, $destArr, $prefix) = @_;
    return 0 unless $numberOfElements > 0;

    my $total = 0;
    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename = './serptemp/' . $prefix . "$i.txt";
        my $filesize = -s "$filename";

        # -s yields no number for a missing or empty file; record 0.
        $destArr->[$i] = $filesize ? $filesize : 0;
        $total += $destArr->[$i];
    }
    return ($total / $numberOfElements);
}

# Helper: returns the part of a URL between the first "//" and the next
# "//" (empty string when there is none).
sub _urlAfterScheme {
    my ($url) = @_;
    my @parts = split(/\/\//, defined $url ? $url : "");
    return defined $parts[1] ? $parts[1] : "";
}

# Helper: returns the first two dot-separated pieces of the URL (scheme
# removed) joined together; used to decide whether two URLs are "the same site".
sub _domainKey {
    my ($url) = @_;
    my @labels = split(/\./, _urlAfterScheme($url));
    return join("", map { defined $_ ? $_ : "" } @labels[0, 1]);
}

# Helper: returns the host part of a URL (scheme and path removed).
sub _baseHost {
    my ($url) = @_;
    my $rest  = _urlAfterScheme($url);
    my @parts = split(/\//, $rest);
    my $host  = ($#parts > 0) ? $parts[0] : $rest;
    $host =~ s/\/$//;
    return $host;
}

# Subroutine:
#   outboundLinkPRAnalysis
# Description:
#   This subroutine requests a PR value for the root domain
#   of every outbound link of each downloaded page
# Inputs:
#   $numberOfElements => number of files to process
#   $srcLinksArr => array (reference) to links array (URL of each page)
#   $destArr => array (reference) receiving "PR|PR|..." per page
#   $prefix  => SE file prefix
# Outputs:
#   none
# Returns:
#   No returns
sub outboundLinkPRAnalysis {
    my ($numberOfElements, $srcLinksArr, $destArr, $prefix) = @_;
    my $PRURL = 'http://www.seowarrior.net/scripts/pr.php?pr=';
    my $range = 2;

    # loop through each file
    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename = './serptemp/' . $prefix . "$i.txt";
        my %linkHash = ();
        my $PRs      = "";

        # check for file existence
        if (-e "$filename") {
            my $parentKey = _domainKey($srcLinksArr->[$i]);
            my $p         = HTML::TokeParser->new($filename);

            while (my $token = $p->get_tag("a")) {
                my $url = $token->[1]{href} || "-";

                # The anchor text is not needed here, but it is still read so
                # the parser advances exactly as before.
                $p->get_trimmed_text("/a");

                # only absolute links can be external
                next unless $url =~ /^http/;
                next if _domainKey($url) eq $parentKey;

                # each external host is looked up once per page
                my $baseurl = _baseHost($url);
                next if $baseurl eq "" || exists $linkHash{$baseurl};

                # obtain PR value / use random sleep
                sleep(int(rand($range)));
                my $PR = get($PRURL . $baseurl);
                $PR = "" unless defined $PR;    # failed request
                $PR =~ s/\s+//g;
                $PRs .= $PR . "|";
                $linkHash{$baseurl} = 1;
            }
        }
        $destArr->[$i] = $PRs;
    }
    return;
}

# Subroutine:
#   outboundLinkKeywordAnalysis
# Description:
#   This subroutine counts, for each downloaded page, the outbound links
#   whose anchor text contains the keyword (or any word of the keyphrase)
# Inputs:
#   $numberOfElements => number of files to process
#   $srcLinksArr => array (reference) to links array (URL of each page)
#   $destArr => array (reference) receiving the count per page
#   $prefix  => SE file prefix
#   $keyword => keyword
# Outputs:
#   none
# Returns:
#   No returns
sub outboundLinkKeywordAnalysis {
    my ($numberOfElements, $srcLinksArr, $destArr, $prefix, $keyword) = @_;

    # Empty pieces (from doubled spaces) are dropped: an empty pattern
    # would match every anchor text.
    my @keywordFragments = grep { length } split(/ /, $keyword);

    # loop through each file
    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename   = './serptemp/' . $prefix . "$i.txt";
        my $foundCount = 0;

        # check for file existence
        if (-e "$filename") {
            my $parentKey = _domainKey($srcLinksArr->[$i]);
            my $p         = HTML::TokeParser->new($filename);

            while (my $token = $p->get_tag("a")) {
                # get link and anchor text
                my $url  = $token->[1]{href} || "-";
                my $text = $p->get_trimmed_text("/a");
                $text =~ tr/"'//d;

                # check if link internal or external
                next unless $url =~ /^http/;
                next if _domainKey($url) eq $parentKey;

                # external link: one hit per link, whichever word matches
                for my $fragment (@keywordFragments) {
                    if ($text =~ /\Q$fragment\E/i) {
                        $foundCount++;
                        last;
                    }
                }
            }
        }
        $destArr->[$i] = $foundCount;
    }
    return;
}

# Subroutine:
#   printKeywordProminenceMap
# Description:
#   This subroutine prints each URL map
# Inputs:
#   $srcArr => array (reference) to result array
#   $numberOfElements => number of entries to print
# Outputs:
#   prints the keyword map, one entry per line
# Returns:
#   No returns
sub printKeywordProminenceMap {
    my ($srcArr, $numberOfElements) = @_;
    for (my $i = 0; $i < $numberOfElements; $i++) {
        print "$srcArr->[$i]\n";
    }
    return;
}

# Subroutine:
#   printIndividualKeywordProminenceMap
# Description:
#   This subroutine writes the map of one URL to its own HTML file
# Inputs:
#   $index  => index of the entry to write
#   $srcArr => array (reference) to result array
#   $prefix => SE file prefix
# Outputs:
#   Creates ./report/maps/<prefix><index>.html
# Returns:
#   No returns
sub printIndividualKeywordProminenceMap {
    my ($index, $srcArr, $prefix) = @_;
    my $filename = "./report/maps/$prefix" . $index . ".html";

    open(my $fh, ">", $filename) or die "Cannot write $filename: $!";
    print {$fh} "<html><head><title>\n";
    print {$fh} "Keyword Prominence Map\n";
    print {$fh} "</title></head>\n";
    print {$fh} "<body><table width=400 cellpadding=2 cellspacing=0><tr><td width=400>";
    print {$fh} $srcArr->[$index];
    print {$fh} "</td></tr></table></body></html>";
    close($fh) or die "Cannot close $filename: $!";
    return;
}

# Subroutine:
#   analyzeKeywordPositions
# Description:
#   This subroutine analyzes relative positions of keywords within a page copy
# Inputs:
#   $numberOfElements => number of files to process
#   $destArr  => array (reference) receiving the keyword map per file
#   $destArr2 => array (reference) receiving "pos|pos|..." per file
#   $prefix   => file prefix
#   $keyword  => keyword to analyze
# Outputs:
#   No outputs produced
# Returns:
#   No
#   returns; all work is done on the passed arrays
sub analyzeKeywordPositions {
    my ($numberOfElements, $destArr, $destArr2, $prefix, $keyword) = @_;
    my @keywordFragments = split(/ /, $keyword);

    # loop through each file
    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $pageCopy = "";
        my $filename = './serptemp/' . $prefix . "$i.txt";

        # check for file existence
        unless (-e "$filename") {
            print "\nfile does not exist";
            next;
        }

        # get pageCopy for this file
        my $p = HTML::TokeParser->new($filename);
        while (my $token = $p->get_tag("body")) {
            $pageCopy = $p->get_trimmed_text("/body");
            $pageCopy = cleanText($pageCopy);
        }
        $pageCopy =~ s/\s+/ /g;
        my @words = split(/ /, $pageCopy);

        # loop through all words, including the last one
        for (my $j = 0; $j <= $#words; $j++) {
            my $word = $words[$j];
            my $matchIndex;

            if ($#keywordFragments > 0) {
                # handle multi keywords: the first matching word wins
                for (my $k = 0; $k <= $#keywordFragments; $k++) {
                    next if $keywordFragments[$k] eq "";
                    if ($word =~ /\Q$keywordFragments[$k]\E/i) {
                        $matchIndex = $k;
                        last;
                    }
                }
            }
            else {
                # handle single keyword
                $word =~ s/"//;
                $word =~ s/'//;
                $matchIndex = 0 if $keyword ne "" && $word =~ /\Q$keyword\E/i;
            }

            if (defined $matchIndex) {
                # map entry: index of the matching word in the keyword array
                $destArr->[$i] .= "$matchIndex ";
                # relative position of the match within the page copy
                $destArr2->[$i] .= "$j" . "|";
                # NOTE(review): a "* " also follows every match, as in the
                # original listing - confirm that the map reader expects it.
                $destArr->[$i] .= "* ";
            }
            else {
                $destArr->[$i] .= "* ";
            }
        }
    }
    return;
}

# Subroutine:
#   checkHeaderTags
# Description:
#   This subroutine checks use of heading tags in addition to checking
#   for keyword use in the same tags.
# Inputs:
#   $numberOfElements => number of files to process
#   $destArr1 => array (reference) receiving "n1|n2|n3|n4|n5|n6", the
#                number of h1..h6 headings of each file
#   $destArr2 => array (reference) receiving "Y|N|..." per file: whether
#                the keyword occurs in a heading of that level
#   $prefix   => file prefix
#   $keyword  => keyword to analyze
# Outputs:
#   No outputs produced
# Returns:
#   No returns; all work done on arrays
sub checkHeaderTags {
    my ($numberOfElements, $destArr1, $destArr2, $prefix, $keyword) = @_;
    my @keywordFragments = grep { length } split(/ /, $keyword);
    my @levels = qw(h1 h2 h3 h4 h5 h6);

    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename = './serptemp/' . $prefix . "$i.txt";

        unless (-e "$filename") {
            # no file => insert defaults
            $destArr1->[$i] = join("|", (0) x @levels);
            $destArr2->[$i] = join("|", ("N") x @levels);
            next;
        }

        # Collect the text of every heading, grouped by level. Keeping a
        # list per level avoids joining with a separator that cleanText
        # would remove again.
        my %headings = map { $_ => [] } @levels;
        my $p = HTML::TokeParser->new($filename);
        while (my $token = $p->get_token) {
            next unless $token->[0] eq 'S' && exists $headings{ $token->[1] };
            my $level = $token->[1];
            push @{ $headings{$level} }, cleanText($p->get_text("/$level"));
        }

        my (@counts, @flags);
        for my $level (@levels) {
            my @texts = @{ $headings{$level} };
            push @counts, scalar @texts;

            # "Y" when any heading of this level contains the keyword
            # (or, for a keyphrase, any of its words)
            my $flag = "N";
          TEXT:
            for my $text (@texts) {
                for my $fragment (@keywordFragments) {
                    if ($text =~ /\Q$fragment\E/i) {
                        $flag = "Y";
                        last TEXT;
                    }
                }
            }
            push @flags, $flag;
        }

        $destArr1->[$i] = join("|", @counts);
        $destArr2->[$i] = join("|", @flags);
    }
    return;
}

# Subroutine:
#   checkExactDescriptionMeta
# Description:
#   This subroutine checks for exact keyword match in the meta description.
# Inputs:
#   $numberOfElements => number of files to process
#   $destArr => array (reference) to result array ("Y" or "N" per file)
#   $keyword => keyword to analyze
#   $prefix  => file prefix
# Outputs:
#   No outputs produced
# Returns:
#   No returns; all work done on array
sub checkExactDescriptionMeta {
    my ($numberOfElements, $destArr, $keyword, $prefix) = @_;

    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename = './serptemp/' . $prefix . "$i.txt";

        if (-e "$filename") {
            my $p = HTML::TokeParser->new($filename);
            while (my $token = $p->get_tag("meta")) {
                my $name = $token->[1]{name};
                next unless defined $name && $name =~ /description/i;

                my $metaDescription = $token->[1]{content};
                $metaDescription = "" unless defined $metaDescription;
                $metaDescription =~ tr/"'//d;

                # when several description tags exist the last one decides
                $destArr->[$i] =
                    ($keyword ne "" && $metaDescription =~ /\Q$keyword\E/i) ? "Y" : "N";
            }
        }

        # no file or no description tag
        $destArr->[$i] = "N" unless defined $destArr->[$i];
    }
    return;
}

# Subroutine:
#   checkPartialDescriptionMeta
# Description:
#   This subroutine checks for a partial keyword match in the meta
#   description: any single word of the keyphrase is enough.
# Inputs:
#   $numberOfElements => number of files to process
#   $destArr => array (reference) to result array ("Y" or "N" per file)
#   $keyword => keyword to analyze
#   $prefix  => file prefix
# Outputs:
#   No outputs produced
# Returns:
#   No returns; all work done on array
sub checkPartialDescriptionMeta {
    my ($numberOfElements, $destArr, $keyword, $prefix) = @_;
    my @keywordFragments = grep { length } split(/ /, $keyword);

    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename = './serptemp/' . $prefix . "$i.txt";

        if (-e "$filename") {
            my $p = HTML::TokeParser->new($filename);
            while (my $token = $p->get_tag("meta")) {
                my $name = $token->[1]{name};
                next unless defined $name && $name =~ /description/i;

                my $metaDescription = $token->[1]{content};
                $metaDescription = "" unless defined $metaDescription;

                $destArr->[$i] = "N";
                for my $fragment (@keywordFragments) {
                    if ($metaDescription =~ /\Q$fragment\E/i) {
                        $destArr->[$i] = "Y";
                        last;
                    }
                }
            }
        }

        # no file or no description tag
        $destArr->[$i] = "N" unless defined $destArr->[$i];
    }
    return;
}

# Subroutine:
#   keywordDensity
# Description:
#   This subroutine calculates keyword density for given keyword.
# Inputs:
#   $numberOfElements => number of files to process
#   $keyword => keyword to analyze
#   $destArr => array (reference) to result array (density per file)
#   $prefix  => file prefix
# Outputs:
#   No outputs produced
# Returns:
#   Returns the average keyword density (0 when there are no files)
sub keywordDensity {
    my ($numberOfElements, $keyword, $destArr, $prefix) = @_;
    return 0 unless $numberOfElements > 0;

    my $total = 0;

    # loop through all files
    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $pageCopy = "";
        my $filename = './serptemp/' . $prefix . "$i.txt";

        if (-e "$filename") {
            my $p = HTML::TokeParser->new($filename);
            while (my $token = $p->get_tag("body")) {
                $pageCopy = $p->get_trimmed_text("/body");
            }
        }
        else {
            print "\nFile not found when calculating keyword density.";
        }

        # compare copy and keyword (separate function)
        $pageCopy =~ tr/"'//d;
        $total += calculateKD($i, $pageCopy, $destArr, $keyword);
    }
    return ($total / $numberOfElements);
}

# Subroutine:
#   calculateKD
# Description:
#   Helper subroutine to calculate keyword density: occurrences of the
#   keyword divided by the number of words of the page copy, times 100.
#   For a keyphrase the densities of its individual words are added up.
# Inputs:
#   $index    => index of the result array element to set
#   $pageCopy => page copy text
#   $destArr  => array (reference) to result array
#   $keyword  => keyword to analyze
# Outputs:
#   No outputs produced
# Returns:
#   Returns the density that was stored in $destArr->[$index]
sub calculateKD {
    my ($index, $pageCopy, $destArr, $keyword) = @_;
    $pageCopy = "" unless defined $pageCopy;

    my @words            = split(' ', $pageCopy);
    my $wordCount        = scalar @words;
    my @keywordFragments = grep { length } split(/ /, $keyword);

    my $density = 0;
    if ($wordCount > 0) {    # an empty page has density 0, not a division by zero
        for my $fragment (@keywordFragments) {
            # list assignment in scalar context counts the matches
            my $occurrences = () = $pageCopy =~ /\Q$fragment\E/g;
            $density += ($occurrences / $wordCount) * 100;
        }
    }

    $destArr->[$index] = $density;
    return $density;
}

# Subroutine:
#   keywordDomainExactMatch
# Description:
#   This subroutine analyzes keywords in domain names.
#   It looks to see if keyword is part of the domain name; for a
#   keyphrase every one of its words has to occur in the domain name.
#   Possible improvement could also consider keyword stemming.
# Inputs:
#   $keyword  => keyword to analyze
#   $linksArr => array (reference) to links array
#   $numberOfElements => number of links to process
#   $destArr  => array (reference) to result array ("Y" or "N" per link)
# Outputs:
#   No outputs produced
# Returns:
#   Returns the percentage of links with a match (0 when there are no links)
sub keywordDomainExactMatch {
    my ($keyword, $linksArr, $numberOfElements, $destArr) = @_;
    return 0 unless $numberOfElements > 0;

    my @keywordFragments = grep { length } split(/ /, $keyword);
    my $total = 0;

    for (my $i = 0; $i < $numberOfElements; $i++) {
        # reduce the link to its host: drop the scheme, cut at the first "/"
        my $tmp = defined $linksArr->[$i] ? $linksArr->[$i] : "";
        $tmp =~ s{^https?://}{}i;
        my ($link) = split(/\//, $tmp);
        $link = "" unless defined $link;

        # number of keyword words found in the host
        my $matchCnt = grep { $link =~ /\Q$_\E/i } @keywordFragments;

        if (@keywordFragments && $matchCnt == @keywordFragments) {
            $destArr->[$i] = "Y";
            $total++;
        }
        else {
            $destArr->[$i] = "N";
        }
    }
    return (($total / $numberOfElements) * 100);
}

# Subroutine:
#   keywordDomainPartialMatch
# Description:
#   This subroutine analyzes keywords in domain names. It looks
#   for partial match between the keyword and the domain name.
# Inputs:
#   $keyword  => keyword to analyze
#   $linksArr => array (reference) to links array
#   $numberOfElements => number of links to process
#   $destArr  => array (reference) to result array ("Y" or "N" per link)
# Outputs:
#   No outputs produced
# Returns:
#   Returns the percentage of links with a match (0 when there are no links)
sub keywordDomainPartialMatch {
    my ($keyword, $linksArr, $numberOfElements, $destArr) = @_;
    return 0 unless $numberOfElements > 0;

    # One matching word is enough. The hyphenated and concatenated forms
    # of a keyphrase contain its words, so testing the words covers them.
    my @keywordFragments = grep { length } split(/ /, $keyword);
    my $matchCnt = 0;

    # loop through all links
    for (my $i = 0; $i < $numberOfElements; $i++) {
        # reduce the link to its host: drop the scheme, cut at the first "/"
        my $tmp = defined $linksArr->[$i] ? $linksArr->[$i] : "";
        $tmp =~ s{^https?://}{}i;
        my ($link) = split(/\//, $tmp);
        $link = "" unless defined $link;

        $destArr->[$i] = "N";
        for my $fragment (@keywordFragments) {
            if ($link =~ /\Q$fragment\E/i) {
                $destArr->[$i] = "Y";
                $matchCnt++;
                last;
            }
        }
    }
    return (($matchCnt / $numberOfElements) * 100);
}

# Subroutine:
#   compareTitlePageCopy
# Description:
#   This subroutine compares page title to page copy
# Inputs:
#   $numberOfElements => number of files to process
#   $titlesArr => array (reference) to titles array
#   $destArr   => array (reference) to result array
#   $prefix    => file prefix for the three SEs
# Outputs:
#   No outputs produced
# Returns:
#   No returns; all work done on passed arrays
sub compareTitlePageCopy {
    my ($numberOfElements, $titlesArr, $destArr, $prefix) = @_;

    # loop through all files
    for (my $i = 0; $i < $numberOfElements; $i++) {
        # split up current title into token words
        my $title = cleanText($titlesArr->[$i]);
        $title =~ s/'//g;
        $title =~ s/"//g;
        my @titleFragments = split(/ /, $title);

        # get copy of each file
        my $pageCopy = "";
        my $filename = './serptemp/' . $prefix . "$i.txt";
        if (-e "$filename") {
            my $p = HTML::TokeParser->new($filename);
            if (my $token = $p->get_tag("body")) {
                $pageCopy = $p->get_trimmed_text("/body");
                $pageCopy =~ s/'//g;
                $pageCopy =~ s/"//g;
            }
        }

        # compare copy and array (separate function); the fragments are
        # passed by reference so that they arrive as one argument
        compareTitlePageCopyHelper($i, $#titleFragments, \@titleFragments,
                                   $pageCopy, $destArr);
    }
    return;
}

# Subroutine:
#   compareTitlePageCopyHelper
# Description:
#   This subroutine is used by compareTitlePageCopy subroutine
#   to compare page title to page copy
# Inputs:
#   $index            => represents numerical index of the array
#   $numberOfElements => last index of the title fragments array
#   $titleFragments   => array (reference) to title fragments array
#   $pageCopy         => page copy text
#   $pageCopyTitleArr => array (reference) to resulting array
# Outputs:
#   No outputs produced
# Returns:
#   No returns; all work done on passed arrays
sub compareTitlePageCopyHelper {
    my ($index, $numberOfElements, $titleFragments, $pageCopy, $pageCopyTitleArr) = @_;
    $pageCopy = "" unless defined $pageCopy;

    my $foundCnt = 0;
    my $total    = 0;
    for (my $j = 0; $j <= $numberOfElements; $j++) {
        my $tmpfragment = $titleFragments->[$j];

        # empty fragments say nothing about the title; leave them out
        next unless defined $tmpfragment && $tmpfragment ne "";
        $total++;
        $foundCnt++ if $pageCopy =~ /\Q$tmpfragment\E/i;
    }

    # percentage of title words that occur in the page copy
    $pageCopyTitleArr->[$index] = ($foundCnt == 0) ? 0 : (($foundCnt / $total) * 100);
    return;
}

# Subroutine:
#   compareArrays
# Description:
#   This subroutine compares elements of two arrays to see if they
#   are found in each other.
# Inputs:
#   $numOfElements => number of elements to compare
#   $realArr  => array (reference) to first source array
#   $foundArr => array (reference) to second source array
#   $destArr  => array (reference) to result array ("Y" or "N" per element)
# Outputs:
#   No outputs produced
# Returns:
#   Subroutine returns percentage of found matches (0 for no elements)
sub compareArrays {
    my ($numOfElements, $realArr, $foundArr, $destArr) = @_;
    return 0 unless $numOfElements > 0;

    my $found = 0;
    for (my $i = 0; $i < $numOfElements; $i++) {
        my $real   = defined $realArr->[$i]  ? $realArr->[$i]  : "";
        my $needle = defined $foundArr->[$i] ? $foundArr->[$i] : "";

        # Plain case-insensitive substring search: the text is compared
        # literally, so no character of it has to be escaped.
        if (index(lc $real, lc $needle) >= 0) {
            $destArr->[$i] = "Y";
            $found++;
        }
        else {
            $destArr->[$i] = "N";
        }
    }
    return (($found / $numOfElements) * 100);
}

# Subroutine:
#   getRealTitles
# Description:
#   This subroutine retrieves actual titles
# Inputs:
#   $numberOfElements => number of files to process
#   $titlesArr => array (reference) to array that will contain real titles
#   $prefix    => prefix of file name to be used
# Outputs:
#   No outputs produced
# Returns:
#   Subroutine operates on array already defined outside the routine.
#   Subroutine returns nothing.
sub getRealTitles {
    my ($numberOfElements, $titlesArr, $prefix) = @_;

    for (my $i = 0; $i < $numberOfElements; $i++) {
        my $filename = './serptemp/' . $prefix . "$i.txt";

        # default for a missing file, a file without title tag or an empty title
        my $title = "not found";

        if (-e "$filename") {
            my $p = HTML::TokeParser->new($filename);
            while (my $token = $p->get_token) {
                next unless $token->[0] eq "S" && lc($token->[1]) eq 'title';

                my $text = $p->get_text();
                $text = "" unless defined $text;
                $text =~ s/^\s+//;
                $text =~ s/\s+$//;
                $title = $text if $text ne "";
                last;
            }
        }
        $titlesArr->[$i] = $title;
    }
    return;
}

# Subroutine:
#   getKeywordsTitleMatch
# Description:
#   This subroutine compares given keyword with entries of array
#   while setting third array with results of this comparison
# Inputs:
#   $keyword       => keyword or keyphrase to do analysis on
#   $sourceArr     => array (reference) to be used for comparisons
#   $numOfElements => size of referred array
#   $destArr       => array (reference) that will contain comparison results
# Outputs:
#   No outputs produced
# Returns:
#   Subroutine operates on array already defined outside the routine.
#   Subroutine returns nothing.
sub getKeywordsTitleMatch {
    my ($keyword, $sourceArr, $numOfElements, $destArr) = @_;

    $keyword = cleanText($keyword);
    $keyword =~ tr/"'//d;
    my @keywordFragments = grep { length } split(/ /, $keyword);

    for (my $i = 0; $i < $numOfElements; $i++) {
        my $tmp = cleanText($sourceArr->[$i]);
        $tmp =~ tr/"'//d;

        # "Y" when the title contains the keyword or any word of the keyphrase
        $destArr->[$i] = "N";
        for my $fragment (@keywordFragments) {
            if ($tmp =~ /\Q$fragment\E/i) {
                $destArr->[$i] = "Y";
                last;
            }
        }
    }
    return;
}

# Subroutine:
#   initializeKeyVariables
# Description:
#   Main purpose is to setup link and title arrays that are
#   to be used throughout the script.
# Inputs:
#   $keyword         => keyword or keyphrase to do analysis on
#   $googleLinksArr  => array (reference) containing Google links
#   $googleTitlesArr => array (reference) containing Google titles
#   $yahooLinksArr   => array (reference) containing Yahoo! links
#   $yahooTitlesArr  => array (reference) containing Yahoo!
#                       titles
#   $bingLinksArr    => array (reference) containing Bing links
#   $bingTitlesArr   => array (reference) containing Bing titles
# Outputs:
#   No outputs produced
# Returns:
#   Subroutine operates on arrays already defined outside the routine.
#   Subroutine returns nothing.
sub initializeKeyVariables {
    my ($keyword, $googleLinksArr, $googleTitlesArr,
        $yahooLinksArr, $yahooTitlesArr,
        $bingLinksArr, $bingTitlesArr) = @_;

    # one user agent, time out of 25 seconds, used for all three requests
    my $ua = LWP::UserAgent->new;
    $ua->timeout(25);
    $ua->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");

    # setup & get results for each SE; $numres is set outside this subroutine
    my $gurl = "http://www.google.com/search?num=$numres&hl=en&safe=off&q=$keyword&sa=N";
    my $yurl = "http://search.yahoo.com/search?p=$keyword&ei=UTF-8&fr=sfp&n=$numres&b=1";
    my $lurl = "http://search.bing.com/results.aspx?q=$keyword&first=1&count=$numres&";

    # assign SERPs to special variables
    my $ghtml = $ua->request(HTTP::Request->new(GET => "$gurl"))->content;
    my $yhtml = $ua->request(HTTP::Request->new(GET => "$yurl"))->content;
    my $lhtml = $ua->request(HTTP::Request->new(GET => "$lurl"))->content;

    # get links for each serp; a reference makes the parser read the string
    # itself, a plain string would be taken as a file name
    my $streamGoogle = HTML::TokeParser->new(\$ghtml);
    my $streamYahoo  = HTML::TokeParser->new(\$yhtml);
    my $streamBing   = HTML::TokeParser->new(\$lhtml);

    # process google links
    my $cnt = 0;
    while (my $token = $streamGoogle->get_token) {
        next unless $token->[0] eq 'S' && $token->[1] eq 'a';
        my $href = defined $token->[2]{'href'} ? $token->[2]{'href'} : "";

        # keep plain http result links only
        next unless $href =~ /^http/i;
        next if $href =~ /^https/i;
        next if $href =~ /cache|google|aclk|youtube|wikipedia/i;

        $googleLinksArr->[$cnt]  = $href;
        $googleTitlesArr->[$cnt] = $streamGoogle->get_trimmed_text("/a");
        $googleTitlesArr->[$cnt] =~ s/\.\.\.$//;    # drop a trailing "..."
        $cnt++;
    }

    # process yahoo links
    my $cnt2 = 0;
    while (my $token = $streamYahoo->get_token) {
        next unless $token->[0] eq 'S' && $token->[1] eq 'a';
        my $href = defined $token->[2]{'href'} ? $token->[2]{'href'} : "";

        # the result URL is the part of the href after "**"
        my @tmpurl = split(/\*\*/, $href);
        my $target = defined $tmpurl[1] ? $tmpurl[1] : "";
        $target =~ s/%3f/?/g;
        $target =~ s/%26/&/g;

        next if $target =~ /^https/i;
        next if $target =~ /cache|yahoo|wikipedia|overture/i;

        $target =~ s/%3a/:/g;
        $target =~ s/^\s+//g;
        next if $target eq "";

        $yahooLinksArr->[$cnt2]  = $target;
        $yahooTitlesArr->[$cnt2] = $streamYahoo->get_trimmed_text("/a");
        $yahooTitlesArr->[$cnt2] =~ s/\.\.\.$//;
        $cnt2++;
    }

    # process bing links
    my $cnt3 = 0;
    while (my $token = $streamBing->get_token) {
        next unless $token->[0] eq 'S' && $token->[1] eq 'a';
        my $href = defined $token->[2]{'href'} ? $token->[2]{'href'} : "";

        next unless $href =~ /^http/i;
        next if $href =~ /^https/i;
        next if $href =~ /cache|wikipedia|msn|hotmail|microsoft|bing\.com/i;

        $bingLinksArr->[$cnt3]  = $href;
        $bingTitlesArr->[$cnt3] = $streamBing->get_trimmed_text("/a");
        $bingTitlesArr->[$cnt3] =~ s/\.\.\.$//;
        $cnt3++;
    }
    return;
}

# Subroutine:
#   getSERPResults
# Description:
#   This subroutine downloads htmls of all urls specified
#   in the array referenced by $urlArr
# Inputs:
#   $numberOfElements => size of referred array
#   $urlArr => array (reference) containing urls to process
#   $name   => prefix of file name to be used
# Outputs:
#   text files contain html from downloaded links
# Returns:
#   Subroutine operates on array already
#   defined outside the routine.
#   Subroutine returns nothing.
sub getSERPResults {
    my ($numberOfElements, $urlArr, $name) = @_;

    my $ua = LWP::UserAgent->new;
    $ua->timeout(25);
    $ua->agent("My Crawler");

    for (my $i = 0; $i < $numberOfElements; $i++) {
        # each page is stored as ./serptemp/<name><index>.txt
        my $filename = "./serptemp/" . $name . $i . ".txt";
        $ua->get("$urlArr->[$i]", ':content_file' => "$filename");
    }
    return;
}

# Subroutine:
#   cleanText
# Description:
#   This is a utility subroutine to clean HTML fragments: the characters
#   ( ) [ ] . - = | ! , ? ^ : ; & * $ are replaced by a space, then
#   every run of whitespace is reduced to a single space.
# Inputs:
#   $text => content of text to clean
# Outputs:
#   No outputs produced
# Returns:
#   Returns the cleaned text
sub cleanText {
    my $text = shift;
    $text = "" unless defined $text;

    $text =~ s/[()\[\].\-=|!,?^:;&*\$]/ /g;
    $text =~ s/\s+/ /g;
    return $text;
}
#!/usr/local/bin/perl
#####################################################################
# File: linkchecker.pl                                              #
# Description: Check Links Script                                   #
# Usage: perl linkchecker.pl http://somedomain.net > report.csv     #
#####################################################################
use strict;
use warnings;
use WWW::Mechanize;
use LWP::Simple;

my $baseurl = shift;
die "Usage: perl linkchecker.pl http://somedomain.net > report.csv\n"
    unless defined $baseurl && length $baseurl;

# Parallel arrays: index $i of each array describes the same link.
my @url    = ();
my @level  = ();
my @type   = ();
my @title  = ();
my @status = ();
my @page   = ();
my %uniqueURL   = ();
my $masterCnt   = 0;
my $masterLevel = 1;

# autocheck is switched off so that a single dead page does not abort
# the whole crawl; success is tested explicitly after every get().
my $mech = WWW::Mechanize->new( autocheck => 0 );

#### Processing Level One
$mech->get($baseurl);
die "Could not fetch $baseurl (status " . $mech->status() . ")\n"
    unless $mech->success();

foreach my $link ($mech->links()) {
    my $href = $link->url();
    next unless defined $href;
    next if $href =~ /mailto/i || $href =~ /javascript/i;

    if ($href !~ /^http/) {
        # relative link: resolved naively against the base URL
        my $tmpurl = $baseurl . '/' . $href;
        $uniqueURL{$tmpurl} = $link->text();
        $url[$masterCnt]    = $tmpurl;
        $type[$masterCnt]   = "relative";
    }
    else {
        $uniqueURL{$href} = $link->text();
        $url[$masterCnt]  = $href;
        $type[$masterCnt] = ($href =~ /\Q$baseurl\E/)
                          ? "absolute internal"
                          : "outbound";
    }
    $level[$masterCnt] = $masterLevel;
    $title[$masterCnt] = $link->text();
    $page[$masterCnt]  = $baseurl;
    $masterCnt++;
}
$masterLevel++;

#### Processing Level Two
my %levTwoURLs = ();
$masterCnt = processSubLevel(2, $masterCnt, \@url, \@level, \@type,
                             \@title, \@status, \@page, \%uniqueURL,
                             $baseurl, $masterLevel, \%levTwoURLs);
$masterLevel++;

#### Processing Level Three
my %levThreeURLs = ();
$masterCnt = processSubLevel(3, $masterCnt, \@url, \@level, \@type,
                             \@title, \@status, \@page, \%levTwoURLs,
                             $baseurl, $masterLevel, \%levThreeURLs);
$masterLevel++;

#### Processing Level Four
my %levFourURLs = ();
$masterCnt = processSubLevel(4, $masterCnt, \@url, \@level, \@type,
                             \@title, \@status, \@page, \%levThreeURLs,
                             $baseurl, $masterLevel, \%levFourURLs);

printReport(\@level, \@page, \@url, \@type, \@title, \@status, $masterCnt);

#### subroutines

# Subroutine:
#   processSubLevel
# Description:
#   Visits every internal page listed in %$urls and appends the links
#   found there to the parallel arrays.
# Inputs:
#   $currentLevel  => number of the level being processed (not used)
#   $mstCnt        => next free index of the parallel arrays
#   $urlArr, $leArr, $tyArr, $tiArr, $stArr, $paArr
#                  => references to the url, level, type, title,
#                     status and parent page arrays
#   $urls          => hash reference, URL => link text, pages to visit
#   $burl          => base URL of the site
#   $mlevel        => level number stored for every link found
#   $uniqueHashRef => hash reference that receives the URLs seen for
#                     the first time on this level
# Returns:
#   The next free index of the parallel arrays
sub processSubLevel {
    my ($currentLevel, $mstCnt, $urlArr, $leArr, $tyArr, $tiArr, $stArr,
        $paArr, $urls, $burl, $mlevel, $uniqueHashRef) = @_;

    # URLs that already have a row in the report
    my %urlHash = map { $_ => 1 } @$urlArr;

    # Resources that are not fetched to look for further links
    my $skipExtension = qr/\.(?:gif|jpg|png|pdf|doc|xls|asf|mov|avi|xvid|flv|mpg|3gp|mp4|qt|rm|swf|wmv|txt|js|css)$/;

    # keys() is evaluated once, so adding to %$urls below is safe
    foreach my $lURL (keys %$urls) {
        next if $lURL =~ $skipExtension;
        next if $lURL !~ /\Q$burl\E/;                  # internal pages only
        next if $lURL =~ /mailto/i || $lURL =~ /javascript/i;

        $mech->get($lURL);
        next unless $mech->success();                  # dead page: nothing to scan

        foreach my $link ($mech->links()) {
            my $href = $link->url();
            next unless defined $href;
            next if $href =~ /mailto/i || $href =~ /javascript/i;

            my $isRelative = ($href !~ /^http/i);
            my $tmpurl     = $isRelative ? $burl . '/' . $href : $href;
            next if exists $urlHash{$tmpurl};

            # check UNIQUENESS
            if (!exists $urls->{$tmpurl}) {
                $urls->{$tmpurl}          = $link->text();
                $uniqueHashRef->{$tmpurl} = $link->text();
            }

            $urlArr->[$mstCnt] = $tmpurl;
            if ($isRelative) {
                $tyArr->[$mstCnt] = "relative internal";
            }
            elsif ($href =~ /\Q$burl\E/) {
                $tyArr->[$mstCnt] = "absolute internal";
            }
            else {
                $tyArr->[$mstCnt] = "outbound";
            }
            $leArr->[$mstCnt] = $mlevel;
            $tiArr->[$mstCnt] = $link->text();
            $paArr->[$mstCnt] = $lURL;                 # page the link was found on

            # The report prints each URL only once, so later sightings
            # of the same URL do not need a row of their own.
            $urlHash{$tmpurl} = 1;
            $mstCnt++;
        }
    }
    return ($mstCnt);
}

# Subroutine:
#   printReport
# Description:
#   Prints one tab-separated row per unique URL to STDOUT. The status
#   column is the value returned by LWP::Simple::getstore(), which
#   downloads the URL into the scratch file "temp".
# Inputs:
#   $levelArr, $pageArr, $urlArr, $typeArr, $titleArr, $statusArr
#          => references to the parallel arrays ($statusArr is not used)
#   $mCnt  => number of rows held by the arrays
# Returns:
#   Nothing
sub printReport {
    my ($levelArr, $pageArr, $urlArr, $typeArr, $titleArr, $statusArr,
        $mCnt) = @_;
    my %tmpCleanupHash = ();

    print join("\t", "Level", "Parent Page or Location", "Unique URL",
                     "Link Type", "Title", "Status Codes");

    for my $i (0 .. $mCnt - 1) {
        my $reportUrl = $urlArr->[$i];
        next unless defined $reportUrl;
        next if $tmpCleanupHash{$reportUrl}++;
        next unless defined $levelArr->[$i] && $levelArr->[$i] ne "";

        # Link text may be missing (image links) or contain tabs and
        # newlines, which would break the row layout.
        my $linkTitle = defined $titleArr->[$i] ? $titleArr->[$i] : "";
        $linkTitle =~ s/\s+/ /g;

        print "\n", join("\t", $levelArr->[$i], $pageArr->[$i], $reportUrl,
                               $typeArr->[$i], $linkTitle,
                               getstore($reportUrl, "temp"));
    }
    print "\n";
    return;
}
################################################
# File: mymonitor.pl                           #
# Description: This script takes an argument   #
#              representing a web page url     #
# Format: perl mymonitor.pl http://www.xyz.com #
################################################
use strict;
use warnings;
use threads;
use Benchmark;
use Time::HiRes qw(gettimeofday tv_interval);
use LWP::Simple;
use LWP::UserAgent;
use File::Path;

#get page to monitor
my $pageToMonitor = shift;
die "Format: perl mymonitor.pl http://www.xyz.com\n"
    unless defined $pageToMonitor && length $pageToMonitor;

my $ua = LWP::UserAgent->new;

#cleanup temp files (the ./temp directory itself is created/kept)
mkpath('./temp');
rmtree('./temp', { keep_root => 1 });

# start timer
my $start_time = [gettimeofday];
my $res = $ua->get($pageToMonitor, ':content_file' => "./temp/temp.dat");
# stop timer
my $end_time    = [gettimeofday];
my $elapsedtime = tv_interval($start_time, $end_time);

# The elapsed time is recorded even for a failed request, but the
# failure is reported on STDERR.
warn "Request for $pageToMonitor failed: " . $res->status_line . "\n"
    unless $res->is_success;

##### CREATING DATA FILES #####################################
my ($sec, $min, $hour, $mday, $mon, $year) = localtime time;
$year += 1900;
$mon++;
my $timestamp = "$mday-$mon-$year $hour:$min:$sec";

# One record is appended to each report file.
# NOTE(review): the fixed ";10;15;20;" fields are copied from the
# original records; their meaning is not visible here -- confirm with
# whatever reads these files. historical.txt has no space before the
# elapsed time, exactly as in the original.
my @reports = (
    [ 'today',      "$timestamp;10;15;20; $elapsedtime\n" ],
    [ 'month',      "$timestamp;10;15;20; $elapsedtime\n" ],
    [ 'year',       "$timestamp;10;15;20; $elapsedtime\n" ],
    [ 'historical', "$timestamp;10;15;20;$elapsedtime\n"  ],
);

foreach my $report (@reports) {
    my ($period, $record) = @$report;
    my $dir  = "./report/$period";
    my $file = "$dir/$period.txt";

    mkpath($dir);
    open(my $out, '>>', $file) or die "Cannot append to $file: $!\n";
    print {$out} $record;
    close($out) or die "Cannot close $file: $!\n";
}
#!/usr/local/bin/perl
###########################################################
# File: inlinksAnalysis.pl                                #
# Description: This script performs analysis on Yahoo!    #
#              inbound links TSD file                     #
###########################################################
use strict;
use warnings;
use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser;

#get the input params: name of the file and the site's base URL
my ($fileToProcess, $baseurl) = @ARGV;
die "Usage: perl inlinksAnalysis.pl <inlinks file> <base url>\n"
    unless defined $fileToProcess && defined $baseurl && length $baseurl;

print "\nProcessing: $fileToProcess";

# Collect the URL of every linking page (second column of each line).
# NOTE(review): the columns are assumed to be tab separated ("TSD
# file") -- confirm against a real export.
my @URLs = ();
if (-e $fileToProcess) {
    open(my $in, '<', $fileToProcess)
        or die "Cannot open $fileToProcess: $!\n";
    while (my $line = <$in>) {
        chomp $line;
        my @fragments = split(/\t/, $line);
        my $url = $fragments[1];
        next unless defined $url && $url =~ /\S/;
        $url =~ s/^\s+//;
        $url =~ s/\s+$//;
        push @URLs, $url;
    }
    close($in);
}
else {
    print "\nfile ($fileToProcess) does not exist";
}

my $ua = LWP::UserAgent->new;
$ua->agent("My Crawler");

my %linkPopHash   = ();   # target URL => number of inbound links found
my %anchorPopHash = ();   # target URL => anchor text of the last link seen

foreach my $pageUrl (@URLs) {
    # Remove the previous download so that a failed request can never
    # cause the preceding page to be analysed a second time.
    unlink "temp.txt";
    my $res = $ua->get($pageUrl, ':content_file' => "temp.txt");
    next unless $res->is_success && -e "temp.txt";

    my $p = HTML::TokeParser->new("temp.txt") or next;
    while (my $token = $p->get_tag("a")) {
        #get link and anchor text
        my $url        = $token->[1]{href} || "-";
        my $anchorText = $p->get_trimmed_text("/a");
        $url =~ s/^\s+//;
        $url =~ s/\s+$//;

        # only links that point at the analysed site are counted
        next unless $url =~ /\Q$baseurl\E/i;
        $linkPopHash{$url}++;
        $anchorPopHash{$url} = $anchorText;
    }
}

# Report: most linked URL first; ties are ordered by URL.
open(my $report, '>', 'report.txt') or die "Cannot write report.txt: $!\n";
foreach my $key (sort { $linkPopHash{$b} <=> $linkPopHash{$a} || $a cmp $b }
                 keys %linkPopHash) {
    print {$report} "$key, $linkPopHash{$key}, \"$anchorPopHash{$key}\"\n";
}
close($report) or die "Cannot close report.txt: $!\n";
#!/usr/bin/perl
#----------------------------------#
# PROGRAM: Search Phrase Report #
#----------------------------------#
use strict;
use warnings;

# Report row prefix ("<tr><td>resource </td><td>keyword</td>") => hits
my %googleDirCnt = ();

# Characters that must not reach the HTML report unescaped
my %htmlEntityFor = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
);

foreach my $logFile (@ARGV) {
    print "Processing $logFile file\n";
    open(my $log, '<', $logFile)
        or die("Could not open log file: $logFile.");
    while (my $line = <$log>) {
        #do Google analysis
        next unless ($line =~ /q=/) && ($line =~ /google/);

        # everything after "GET " starts with the requested resource
        my (undef, $request) = split('GET ', $line, 2);
        next unless defined $request;
        my ($resource) = split(' ', $request);
        next unless defined $resource;

        # The search phrase is the value of the "q" parameter. It ends
        # at the next '&', at the quote closing the referrer field or
        # at whitespace. Requiring '?' or '&' in front keeps other
        # parameters that merely end in "q" (e.g. "aq=") out.
        next unless $request =~ /(?:^|[?&])q=([^&"\s]*)/;
        my $keyword = $1;

        #do some cleanup: URL-decode only after the value has been cut
        #out, so that an encoded quote (%22) or percent sign (%25)
        #cannot be mistaken for a delimiter or decoded twice
        $keyword =~ tr/+/ /;
        $keyword =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;

        # the log content is untrusted: escape it for the HTML report
        for my $field ($resource, $keyword) {
            $field =~ s/([&<>"])/$htmlEntityFor{$1}/g;
        }

        my $tmpKey = "<tr><td>" . $resource . " </td><td>" . $keyword . "</td>";
        $googleDirCnt{$tmpKey}++;
    }
    close($log);
}

open(my $fp, '>', 'keywordsummary.html')
    or die("Could not write keywordsummary.html: $!");
print {$fp} "<html><head><title>Keyword Summary</title></head>";
print {$fp} "<body><strong>Google Summary</strong>";
print {$fp} "<table width=400><tr><td><b>Resource/URL</b></td><td><b>Keyword</b></td>";
print {$fp} "<td><b>Count</b></td></tr>";
foreach my $key (sort hashValueDescendingNum (keys(%googleDirCnt))) {
    print {$fp} $key . "<td>" . $googleDirCnt{$key} . "</td></tr>";
}
print {$fp} "</table></body></html>";
close($fp) or die("Could not close keywordsummary.html: $!");
# Sort comparator for the keys of %googleDirCnt: highest count first.
# Keys with the same count are ordered alphabetically, so the report
# comes out the same on every run instead of following hash order.
sub hashValueDescendingNum {
    return $googleDirCnt{$b} <=> $googleDirCnt{$a}
        || $a cmp $b;
}
#!/usr/local/bin/perl
###########################################
# File: getRankings.pl                    #
# Description: This script queries SEs    #
#              to produce rankings report #
###########################################
use strict;
use warnings;

### Basic setup part
my $numOfArgs = scalar @ARGV;
if ($numOfArgs < 2) {
    print("\nUsage: perl getRankings.pl [TargetURL] [Keyword]\n");
    print("\nOR\n");
    print("\nUsage: perl getRankings.pl [TargetURL] [Keyword1] [Keyword2] ... [KeywordN]\n");
    exit(0);
}

my $targeturl = $ARGV[0];

# all remaining arguments form the keyword phrase
my $originalkeywordphrase = join(" ", @ARGV[1 .. $#ARGV]);
#remove leading & trailing spaces
$originalkeywordphrase =~ s/^\s+//;
$originalkeywordphrase =~ s/\s+$//;

# percent-encode everything except letters and digits
my $keywordphrase = $originalkeywordphrase;
$keywordphrase =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;

# define Source Urls
my $listingNo = 100;
my $gurl = "http://www.google.com/search?num=$listingNo&hl=en&safe=off&q=$keywordphrase&sa=N";
my $burl = "http://www.bing.com/search?q=$keywordphrase&first=1&count=100&";

### get SERP pages part
# The list form of system() runs wget without a shell, so nothing in
# the URLs can be interpreted as shell syntax.
my $userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)";
foreach my $download ([$gurl, "gserp.html"], [$burl, "bserp.html"]) {
    my ($serpUrl, $serpFile) = @$download;
    my $rc = system('wget', $serpUrl,
                    "--user-agent=$userAgent",
                    "--output-document=$serpFile",
                    '--cookies=off');
    warn "wget failed for $serpUrl\n" if $rc != 0;
}

### analysis part
my $googlePositionNumber     = getPosition($targeturl, "google");
my $bingSearchPositionNumber = getPosition($targeturl, "bing");

# report part ##########################
print "\nRanking Summary Report\n";
print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
print "Keyword/Phrase: $originalkeywordphrase\n";
print "Target URL: $targeturl\n";
print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
print " Google.....: $googlePositionNumber\n";
if ($bingSearchPositionNumber ne "not found") {
    # Bing positions are reported one higher than getPosition() returns;
    # the adjustment is applied to every position of the list.
    my $cntAdjusted = join(",", map { $_ + 1 }
                                split(/,/, $bingSearchPositionNumber));
    print " Bing Search: $cntAdjusted\n";
}
else {
    print " Bing Search: $bingSearchPositionNumber\n";
}
print "\nNote: Check with specific SE to ensure correctness.\n";

##### SUBROUTINES ####################################

# Subroutine:
#   getContent
# Description:
#   Reads a downloaded SERP file into one string and simplifies it:
#   newlines become spaces; double quotes and the <strong>, </strong>,
#   <cite> and </cite> tags are removed.
# Inputs:
#   $filename => file to read
# Returns:
#   The simplified content ("" when the file cannot be read)
sub getContent {
    my $filename = shift;

    my $content = "";
    if (open(my $input, '<', $filename)) {
        # local keeps the slurp setting of $/ inside this block
        local $/ = undef;
        $content = <$input>;
        $content = "" unless defined $content;
        close($input);
    }
    else {
        warn "Cannot read $filename: $!\n";
    }

    #substitute new line character with space character
    $content =~ s/\n/ /g;
    #substitute quotes with nothing
    $content =~ s/"//g;
    #cleanup bing
    $content =~ s/<strong>//g;
    $content =~ s{</strong>}{}g;
    $content =~ s/<cite>//g;
    $content =~ s{</cite>}{}g;
    return $content;
}

# Subroutine:
#   getPosition
# Description:
#   Splits the SERP saved for a search engine at its per-result marker
#   and looks for the target URL in every piece.
# Inputs:
#   $target => URL (fragment) to look for, matched literally and
#              case-insensitively
#   $se     => "google" or "bing"
# Returns:
#   "not found", or the comma separated list of piece numbers that
#   contain the target, each reduced by the engine's offset
sub getPosition {
    my ($target, $se) = @_;

    my @tokens = ();
    my $offset = 0;
    if ($se eq "google") {
        @tokens = split(/h3 class=r/, getContent("gserp.html"));
    }
    elsif ($se eq "bing") {
        @tokens = split(/sa_cc/, getContent("bserp.html"));
        $offset = 2;
    }

    print "number of tokens:" . $#tokens . "\n";

    my @positions = ();
    my $cnt = 0;
    foreach my $token (@tokens) {
        push @positions, $cnt - $offset if $token =~ /\Q$target\E/i;
        $cnt++;
    }
    return @positions ? join(",", @positions) : "not found";
}
-- Queue of tweets waiting to be posted.
--   id      : auto-incremented key; the sender takes the lowest queued id first
--   message : text of the tweet
--   status  : 0 = queued (default), 1 = sent, 2 = removed by the user
CREATE TABLE `mytest`.`queue` ( `id` INT( 6 ) NOT NULL AUTO_INCREMENT PRIMARY KEY , `message` TEXT NOT NULL , `status` INT( 1 ) NOT NULL DEFAULT '0' ) ENGINE = MYISAM ;
<?php
# Settings shared by the tweet queue pages.
# The full "<?php" open tag is used on purpose: with short_open_tag
# disabled a file starting with "<?" is sent to the browser as plain
# text, passwords included.

# change all lines but the last line (Twitter status update link)

#database
$username = "your-db-username";
$password = "your-db-password";
$database = "your-database-name";

#twitter
$tusrid  = 'your-twitter-userid';
$tpasswd = 'your-twitter-password';
$tURL    = 'http://twitter.com/statuses/update.xml';
?>
<html>
<head>
<title>Home</title>
<script>
// Cuts the text of limitField down to at most limitNum characters.
function limitText(limitField, limitNum) {
  if (limitField.value.length > limitNum) {
    limitField.value = limitField.value.substring(0, limitNum);
  }
}
// Submit guard of the form: text can still exceed the limit when it is
// pasted with the mouse, which fires no key events.
function checkLength(form) {
  if (form.message.value.length > 140) {
    alert('Tweets are limited to 140 characters.');
    return false;
  }
  return true;
}
</script>
</head>
<body>
<h3>
What will you be doing?
<br>(or what do you want others to think you are doing)
</h3><br>
<form name=mainform method=post action=add.php onSubmit="return checkLength(this)">
<textarea name="message" rows="3" cols="80" onKeyDown="limitText(this,140);" onKeyUp="limitText(this,140);"></textarea>
<br>
<input type=submit value='Add Future Tweet'>
</form>
<br>
<?php
// NOTE: the mysql_* functions used by these pages exist only up to PHP 5.
include("config.php");
mysql_connect('localhost', $username, $password) or die("Unable to connect to my database");
@mysql_select_db($database) or die("Unable to select my database");

// Read all queued (status = 0) tweets, newest first, before the
// connection is closed.
$tweets = array();
$result = mysql_query("SELECT * FROM queue where status=0 order by id desc");
if ($result) {
    while ($row = mysql_fetch_assoc($result)) {
        $tweets[] = $row;
    }
}
mysql_close();
echo "<b>My Future Tweets</b><br><hr>";
?>
<table border="1" cellspacing="2" cellpadding="2">
<tr>
<td><b>id</b></td>
<td><b>Tweet</b></td>
<td><b>Status</b></td>
</tr>
<?php
foreach ($tweets as $tweet) {
    $id      = (int) $tweet['id'];
    $message = $tweet['message'];
    $status  = $tweet['status'];
    $tmp     = "";
    if ($status < 1) {
        $tmp = "Not Sent";
    }
?>
<tr>
<td nowrap><?php echo $id; ?></td>
<td width=350><?php echo htmlspecialchars($message, ENT_QUOTES); /* stored text is untrusted */ ?>
<a href="delete.php?id=<?php echo $id; ?>">Delete</a>
</td><td nowrap><?php echo $tmp; ?></td>
</tr>
<?php
}
?>
</table>
</body>
</html>
<?php
// Adds the posted message to the tweet queue and returns to index.php.
// NOTE: the mysql_* functions used here exist only up to PHP 5.
include("config.php");
mysql_connect('localhost', $username, $password) or die("Unable to connect to my database");
@mysql_select_db($database) or die("Unable to select my database");

$message = isset($_POST['message']) ? $_POST['message'] : '';
// undo magic quotes where they are active, so the text is escaped once
if (function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc()) {
    $message = stripslashes($message);
}
$message = trim($message);

$added = false;
if ($message !== '') {
    // the posted text is untrusted: escape it before it enters the SQL
    $query = "INSERT INTO queue (message) VALUES ('" . mysql_real_escape_string($message) . "')";
    $added = mysql_query($query) ? true : false;
}
mysql_close();
?>
<script>
alert('<?php echo $added ? "Tweet Added" : "Tweet could not be added"; ?>');
window.location.href = "index.php";
</script>
<?php
// Marks a queued tweet as removed (status = 2) and returns to index.php.
// NOTE: the mysql_* functions used here exist only up to PHP 5.
include("config.php");
mysql_connect('localhost', $username, $password) or die("Unable to connect to my database");
@mysql_select_db($database) or die("Unable to select my database");

// the id comes from the query string: force it to an integer before it
// is placed into the SQL statement
$myid = isset($_GET['id']) ? (int) $_GET['id'] : 0;

$removed = false;
if ($myid > 0) {
    $query = "UPDATE queue SET status=2 WHERE id=$myid";
    $removed = mysql_query($query) ? true : false;
}
mysql_close();
?>
<script>
alert('<?php echo $removed ? "Tweet Removed" : "Tweet could not be removed"; ?>');
window.location.href = "index.php";
</script>
<?php
// Posts the oldest queued tweet to $tURL and marks it as sent (status = 1).
// NOTE: the mysql_* functions used here exist only up to PHP 5.
include("config.php");
mysql_connect('localhost', $username, $password) or die("Unable to connect to my database");
@mysql_select_db($database) or die("Unable to select my database");

### get the tweet
$result = mysql_query("select id, message from queue where status=0 order by id asc LIMIT 1");
$row = $result ? mysql_fetch_array($result) : false;

if (!$row) {
    // nothing is queued: do not post an empty status
    echo 'no tweet queued';
} else {
    ### send the tweet
    $curl_handle = curl_init();
    curl_setopt($curl_handle, CURLOPT_URL, $tURL);
    curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2);
    curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl_handle, CURLOPT_POST, 1);
    // the message must be URL-encoded, otherwise '&', '=', '+' or '%'
    // in the text would corrupt the POST body
    curl_setopt($curl_handle, CURLOPT_POSTFIELDS, "status=" . urlencode($row['message']));
    curl_setopt($curl_handle, CURLOPT_USERPWD, "$tusrid:$tpasswd");
    $response = curl_exec($curl_handle);
    curl_close($curl_handle);

    // get the status message
    if (empty($response)) {
        echo 'tweet not delivered';
    } else {
        echo 'tweet delivered';
        ###update db status
        $mid = (int) $row['id'];
        mysql_query("UPDATE queue SET status = 1 WHERE id = $mid");
    }
}
mysql_close();
?>
The following code listings represent only the main listings. For the full source code please visit book.seowarrior.com.
<html>
<head>
<!-- Keyword dashboard: the buttons call stepOne() with the engine name;
     stepOne(), resizeG/Y/B() and dockit() are expected to come from
     functions.js and dockablewindow.js. -->
<title>SEO Warrior: Keyword Dashboard (Alpha)</title>
<link rel="stylesheet" type="text/css" href="pagestyle.css" />
<script src="functions.js" type="text/javascript"></script>
<script src="dockablewindow.js" type="text/javascript"></script>
</head>
<body>
<table width=100% cellpadding=0 cellspacing=0 border=0>
<tr>
<td valign=top align=left><h1 style='color:blue'>SEO Warrior: Keyword Dashboard (Alpha) </h1>
</td>
<td valign=top align=right>
<img border=0 src="http://www.seowarrior.net/images/status.png" alt="Status" title="SEO Warrior: Keyword Dashboard Status">
<a href="http://www.seowarrior.net/contact/" title="Report Bugs">
<font size=2>Report Bugs</font></a> |
<a href="http://www.seowarrior.net/contact/" title="Make a Suggestion">
<font size=2>Suggestion</font></a>
<a href="http://www.seowarrior.net"><img border=0 src="http://www.seowarrior.net/images/seowarriormini.png" alt="SEO Warrior" title="SEO Warrior: Keyword Dashboard"></a>
</td>
</tr>
</table>
<div id="formdiv">
<form name="mainform" onSubmit="return false;">
Keyword: <input type="text" id="keyword" name="keyword" size="20">
<input type="button" id="phaseGoogleBtn" name="phaseGoogleBtn" value="Google" onclick="stepOne('google')">
<input type="button" id="phaseBingBtn" name="phaseBingBtn" value="Bing" onclick="stepOne('bing')">
<input type="button" id="phaseYahooBtn" name="phaseYahooBtn" value="Yahoo!" onclick="stepOne('yahoo')">
<input type="radio" name="resultLimit" value="10" checked >10
<input type="radio" name="resultLimit" value="20">20
<input type="radio" name="resultLimit" value="50">50
[Results]
</form>
</div>
<!-- one result frame per search engine -->
<iframe onLoad="resizeG()" name="responsedivgoogle" id="responsedivgoogle" scrolling="no"></iframe>
<iframe onLoad="resizeY()" name="responsedivyahoo" id="responsedivyahoo" scrolling="no"></iframe>
<iframe onLoad="resizeB()" name="responsedivbing" id="responsedivbing" scrolling="no"></iframe>
<!-- target frame of the result links -->
<iframe name="detailsframe" id="detailsframe" class="dockclass"></iframe>
<script type="text/javascript">
var dock0=new dockit("detailsframe", 0);
</script>
</body>
</html>
<html>
<head>
<style>
body { font-weight : normal; font-size : 12px; font-family : helvetica; text-decoration : bold; background : #f3f3f3; }
a:hover { font-weight : normal; font-size : 12px; font-family : helvetica; background : #989898; text-decoration : bold; }
a:visited, a:link, a:active { font-weight : normal; font-size : 12px; font-family : helvetica; color : #000022; text-decoration : normal; }
</style>
</head>
<body>
<?php
// Request parameters; both come from the user and are escaped on output.
$keyword  = isset($_GET["keyword"]) ? trim($_GET["keyword"]) : '';
$rowLimit = isset($_GET["resultLimit"]) ? (int) $_GET["resultLimit"] : 10;
?>
<b>Bing</b>
<br>Keyword: <?php echo htmlspecialchars($keyword, ENT_QUOTES); ?>
<br>Showing <?php echo $rowLimit; ?> results
<br><hr>
<?php
// Returns the host part of $url: the text between "://" and the next
// "/" ("" when $url contains no "://").
function getBaseURL($url) {
    $parts = explode("://", $url, 2);
    if (!isset($parts[1])) {
        return '';
    }
    $pieces = explode("/", $parts[1], 2);
    return $pieces[0];
}

// Escapes $host for HTML and wraps every word of $keyword that occurs
// in it in <B> tags (case-sensitive, one pass over the host).
function highlightKeyword($host, $keyword) {
    $alternatives = array();
    foreach (explode(" ", $keyword) as $chunk) {
        if ($chunk !== '') {
            $alternatives[] = preg_quote(htmlspecialchars($chunk, ENT_QUOTES), '/');
        }
    }
    $safeHost = htmlspecialchars($host, ENT_QUOTES);
    if (count($alternatives) == 0) {
        return $safeHost;
    }
    return preg_replace('/(' . implode('|', $alternatives) . ')/', '<B>$1</B>', $safeHost);
}

// Fetches the Bing result page for $mykeyword that starts at result
// number $myindex. Returns the HTML, or false when the request fails.
function getBingSERP($mykeyword, $myindex) {
    // urlencode() turns spaces into "+" and also protects characters
    // such as "&" and "#" that would otherwise break the query string
    $url = "http://www.bing.com/search?q=" . urlencode($mykeyword) . "&first=" . (int) $myindex . "&";
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_COOKIEFILE, "c:cookie.txt");
    $client = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    curl_setopt($ch, CURLOPT_USERAGENT, $client);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    $output = curl_exec($ch);
    curl_close($ch);
    return $output;
}

// Prints one link per new host found in the result page $serp until
// $rowLimit links have been printed in total.
//   $serp      => HTML of a result page (false or "" is skipped)
//   $masterCnt => number of links printed so far
//   $rowLimit  => maximum number of links to print
// Returns the updated number of printed links.
function processSERP($serp, $masterCnt, $rowLimit) {
    if (!is_string($serp) || $serp === '') {
        return $masterCnt;
    }
    $keyword = isset($_GET["keyword"]) ? $_GET["keyword"] : '';

    $dom = new DOMDocument();
    @$dom->loadHTML($serp);   // real-world HTML is rarely valid; silence the parser
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    $sofar = "";              // hosts seen on this page, space separated
    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $baseurl = getBaseURL($url);
        $anchortext = $href->nodeValue;

        // links of the search engine itself are not results
        $isOwnLink = preg_match("/live\.com/i", $url)
                  || preg_match("/msn\.c/i", $url)
                  || preg_match("/microsoft\.com/i", $url);
        $isExternal = preg_match("/^http/i", $url) || preg_match("/^ftp/i", $url);

        if (!$isOwnLink && $isExternal && $baseurl !== ''
                && strpos($sofar, $baseurl) === false
                && $masterCnt < $rowLimit) {
            echo "<a target=detailsframe href='kw.php?url=" . urlencode($url)
               . "&amp;keyword=" . urlencode($keyword)
               . "' title='" . htmlspecialchars($anchortext, ENT_QUOTES) . "'>"
               . highlightKeyword($baseurl, $keyword) . "</a><br>";
            $masterCnt++;
        }
        $sofar = $sofar . $baseurl . " ";
    }
    return $masterCnt;
}

// Up to five result pages of ten results each are requested until
// enough links have been printed.
$masterCnt = 0;
$next = 1;
if ($keyword !== '') {
    for ($page = 0; $page < 5 && $masterCnt < $rowLimit; $page++) {
        if ($page > 0) {
            $next = $next + 10;
            sleep(rand(1, 3));   // pause between requests
        }
        $serpRes = getBingSERP($keyword, $next);
        $masterCnt = processSERP($serpRes, $masterCnt, $rowLimit);
        flush();
    }
}
?>
</body>
</html>
<html>
<head>
<style>
body { font-weight : normal; font-size : 12px; font-family : helvetica; text-decoration : bold; background : #f3f3f3; }
a:hover { font-weight : normal; font-size : 12px; font-family : helvetica; background : #989898; text-decoration : bold; }
a:visited, a:link, a:active { font-weight : normal; font-size : 12px; font-family : helvetica; color : #000022; text-decoration : normal; }
</style>
</head>
<body>
<?php
// Request parameters; both come from the user and are escaped on output.
$keyword  = isset($_GET["keyword"]) ? trim($_GET["keyword"]) : '';
$rowLimit = isset($_GET["resultLimit"]) ? (int) $_GET["resultLimit"] : 10;
?>
<b>Google</b>
<br>Keyword: <?php echo htmlspecialchars($keyword, ENT_QUOTES); ?>
<br>Showing <?php echo $rowLimit; ?> results
<br><hr>
<?php
// Returns the host part of $url: the text between "://" and the next
// "/" ("" when $url contains no "://").
function getBaseURL($url) {
    $parts = explode("://", $url, 2);
    if (!isset($parts[1])) {
        return '';
    }
    $pieces = explode("/", $parts[1], 2);
    return $pieces[0];
}

// Escapes $host for HTML and wraps every word of $keyword that occurs
// in it in <B> tags (case-sensitive, one pass over the host).
function highlightKeyword($host, $keyword) {
    $alternatives = array();
    foreach (explode(" ", $keyword) as $chunk) {
        if ($chunk !== '') {
            $alternatives[] = preg_quote(htmlspecialchars($chunk, ENT_QUOTES), '/');
        }
    }
    $safeHost = htmlspecialchars($host, ENT_QUOTES);
    if (count($alternatives) == 0) {
        return $safeHost;
    }
    return preg_replace('/(' . implode('|', $alternatives) . ')/', '<B>$1</B>', $safeHost);
}

// Fetches the Google result page (50 results) for $mykeyword.
// Returns the HTML, or false when the request fails.
function getGoogleSERP($mykeyword) {
    // urlencode() turns spaces into "+" and also protects characters
    // such as "&" and "#" that would otherwise break the query string
    $url = "http://www.google.com/search?q=" . urlencode($mykeyword) . "&num=50&";
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    $client = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    curl_setopt($ch, CURLOPT_USERAGENT, $client);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    $output = curl_exec($ch);
    curl_close($ch);
    return $output;
}

$serp = ($keyword !== '') ? getGoogleSERP($keyword) : false;

if (is_string($serp) && $serp !== '') {
    $dom = new DOMDocument();
    @$dom->loadHTML($serp);   // real-world HTML is rarely valid; silence the parser
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    $sofar = "";              // hosts seen so far, space separated
    $intCnt = 0;              // links printed so far
    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $baseurl = getBaseURL($url);
        $anchortext = $href->nodeValue;

        // Google's own links, site-relative links and cache links are
        // not results
        $isOwnLink = preg_match("/google\.com/i", $url)
                  || preg_match("/youtube\.com/i", $url)
                  || preg_match("/^\//", $url)
                  || preg_match("/cache:/i", $url);
        $isExternal = preg_match("/^http/i", $url) || preg_match("/^ftp/i", $url);

        if (!$isOwnLink && $isExternal && $baseurl !== ''
                && strpos($sofar, $baseurl) === false
                && $intCnt < $rowLimit) {
            echo "<a target=detailsframe href='kw.php?url=" . urlencode($url)
               . "&amp;keyword=" . urlencode($keyword)
               . "' title='" . htmlspecialchars($anchortext, ENT_QUOTES) . "'>"
               . highlightKeyword($baseurl, $keyword) . "</a><br>";
            $intCnt++;
        }
        $sofar = $sofar . $baseurl . " ";
    }
}
?>
</body>
</html>
<html>
<head>
<style>
body { font-weight : normal; font-size : 12px; font-family : helvetica; text-decoration : bold; background : #f3f3f3; }
a:hover { font-weight : normal; font-size : 12px; font-family : helvetica; background : #989898; text-decoration : bold; }
a:visited, a:link, a:active { font-weight : normal; font-size : 12px; font-family : helvetica; color : #000022; text-decoration : normal; }
</style>
</head>
<body>
<?php
// Request parameters; both come from the user and are escaped on output.
$keyword  = isset($_GET["keyword"]) ? trim($_GET["keyword"]) : '';
$rowLimit = isset($_GET["resultLimit"]) ? (int) $_GET["resultLimit"] : 10;
?>
<b>Yahoo!</b>
<br>Keyword: <?php echo htmlspecialchars($keyword, ENT_QUOTES); ?>
<br>Showing <?php echo $rowLimit; ?> results
<br><hr>
<?php
// Returns the host part of $url: the text between "://" and the next
// "/" ("" when $url contains no "://").
function getBaseURL($url) {
    $parts = explode("://", $url, 2);
    if (!isset($parts[1])) {
        return '';
    }
    $pieces = explode("/", $parts[1], 2);
    return $pieces[0];
}

// Escapes $host for HTML and wraps every word of $keyword that occurs
// in it in <B> tags (case-sensitive, one pass over the host).
function highlightKeyword($host, $keyword) {
    $alternatives = array();
    foreach (explode(" ", $keyword) as $chunk) {
        if ($chunk !== '') {
            $alternatives[] = preg_quote(htmlspecialchars($chunk, ENT_QUOTES), '/');
        }
    }
    $safeHost = htmlspecialchars($host, ENT_QUOTES);
    if (count($alternatives) == 0) {
        return $safeHost;
    }
    return preg_replace('/(' . implode('|', $alternatives) . ')/', '<B>$1</B>', $safeHost);
}

// Fetches the Yahoo! result page (100 results) for $mykeyword.
// Returns the HTML, or false when the request fails.
function getYahooSERP($mykeyword) {
    // urlencode() turns spaces into "+" and also protects characters
    // such as "&" and "#" that would otherwise break the query string
    $url = "http://search.yahoo.com/search;_ylt=?p=" . urlencode($mykeyword) . "&n=100&";
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_REFERER, "http://search.yahoo.com/");
    $client = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    curl_setopt($ch, CURLOPT_USERAGENT, $client);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    $output = curl_exec($ch);
    curl_close($ch);
    return $output;
}

$serp = ($keyword !== '') ? getYahooSERP($keyword) : false;

if (is_string($serp) && $serp !== '') {
    $dom = new DOMDocument();
    @$dom->loadHTML($serp);   // real-world HTML is rarely valid; silence the parser
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    $sofar = "";              // hosts seen so far, space separated
    $intCnt = 0;              // links printed so far
    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $anchortext = $href->nodeValue;

        // Only links that contain "**" are treated as results: the
        // URL-encoded target address is the part after the first "**".
        $pieces = explode('**', $url, 2);
        $isResult = isset($pieces[1]);
        $tmpurl = $isResult ? urldecode($pieces[1]) : '';
        $baseurl = getBaseURL($tmpurl);

        // Yahoo!'s own pages and cache links are not results
        $isOwnLink = preg_match("/yahoo\.com/i", $baseurl) || preg_match("/cache/i", $url);
        $isExternal = preg_match("/^http/i", $url) || preg_match("/^ftp/i", $url);

        if ($isResult && !$isOwnLink && $isExternal && $baseurl !== ''
                && strpos($sofar, $baseurl) === false
                && $intCnt < $rowLimit) {
            echo "<a target=detailsframe href='kw.php?url=" . urlencode($tmpurl)
               . "&amp;keyword=" . urlencode($keyword)
               . "' title='" . htmlspecialchars($anchortext, ENT_QUOTES) . "'>"
               . highlightKeyword($baseurl, $keyword) . "</a><br>";
            $intCnt++;
        }
        $sofar = $sofar . $baseurl . " ";
    }
}
?>
</body>
</html>
3.147.53.119