PHP Google XML Sitemap Bot Class
POSTED AT 15:35 on 23rd May 2008
This bot will crawl any site and produce a valid Google XML sitemap, ready to be submitted through the Sitemaps section of Google's Webmaster Tools.
Simply provide the domain name that you wish to crawl and this PHP class will crawl your site by finding all links and produce a valid XML sitemap.
If you would like a sitemap produced for you, but don’t know how to run this script then my company frozensheep offers this as a service - just get in touch with us.
Source Code
- <?php
- /*
- *
- * Google Sitemap Bot
- *
- * Class to deal with creating an automatic google sitemap.
- * Will create a valid google XML sitemap from any base URL
- *
- * Written By Jacob Wyke - jacob@redvodkajelly.com - www.redvodkajelly.com
- *
- * LICENSE
- * -------
- * Feel free to use this as you wish; just give me credit where credit's due and drop me an email telling me what you're using it for, so I can check out all the cool ways it's been used.
- *
- * USAGE
- * -----
- * To use simply call the class with the desired URL.
- *
- * $objBot = new RVJ_googleMapBot("http://www.redvodkajelly.com");
- *
- * $objBot->saveSiteMap("sitemap.xml");
- *
- * Your sitemap will then be saved with any page on the domain found through crawling the site.
- */
- set_time_limit(0); // 0 removes PHP's execution time limit, so a long-running crawl is not aborted part-way through
/**
 * Crawls a single web site and builds a Google XML sitemap from the pages it finds.
 *
 * Starting from a base URL, the bot downloads each known page, extracts every
 * href, keeps the links that stay on the same site and look like pages (see
 * $arrAllowedExtensions), and repeats until no unvisited URL is left. The
 * result can be returned as a string (displaySiteMap) or written to disk
 * (saveSiteMap).
 *
 * Known limitation: relative links are resolved against the base URL, not
 * against the directory of the page they were found on.
 */
class RVJ_googleMapBot {
    /** @var string Base URL of the site being crawled, without a trailing slash. */
    public $strURL;

    /** @var array URLs that will be written to the sitemap. */
    public $arrURLs = array();

    /** @var array URLs that have already been visited (successfully or not). */
    public $arrCheckedURLs = array();

    /** @var array File extensions that are considered crawlable pages. */
    public $arrAllowedExtensions = array('html', 'htm', 'php', 'php3', 'asp', 'asx', 'xml', 'cfc');

    /** @var array Set (URL => true) of every URL ever queued; stops duplicates and dead links from being re-added. */
    private $arrKnownURLs = array();

    /**
     * Stores the base URL and immediately crawls the site.
     *
     * @param string $strURL The URL you want to crawl, e.g. "http://www.example.com"
     */
    public function __construct($strURL){
        // Links are appended to the base as "<base>/<path>", so a trailing slash
        // on the base would put "//" into every generated URL.
        $this->strURL = rtrim(trim($strURL), '/');
        $this->arrURLs[] = $this->strURL;

        $this->parseSite();
    }

    /**
     * Visits every URL in $arrURLs that has not been checked yet, collecting the
     * links found on each page, until no unvisited URL remains.
     *
     * Implemented as an iterative worklist so that the call depth does not grow
     * with the number of pages on the site.
     *
     * @return void
     */
    public function parseSite(){
        //thanks to http://www.sphider.eu/ for the following regex
        $strPattern = "/(href)\s*=\s*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/\?~=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i";

        // Keyed lookups keep the "have we seen this?" checks O(1).
        $arrDone = array_fill_keys($this->arrCheckedURLs, true);
        $this->arrKnownURLs += array_fill_keys($this->arrURLs, true);
        $this->arrKnownURLs += $arrDone;

        do {
            // Snapshot the URLs still to visit; links discovered during this pass
            // are appended to $arrURLs and picked up by the next pass.
            $arrQueue = array();
            foreach($this->arrURLs as $strURL){
                if(!isset($arrDone[$strURL])){
                    $arrQueue[] = $strURL;
                }
            }

            foreach($arrQueue as $strURL){
                if(isset($arrDone[$strURL])){
                    //the same URL was listed twice
                    continue;
                }
                $arrDone[$strURL] = true;
                $this->arrCheckedURLs[] = $strURL;

                //only pages on the site being crawled are downloaded
                if(!$this->isInternalURL($strURL)){
                    continue;
                }

                $strPage = $this->getPage($strURL);
                if(!is_string($strPage) || $strPage === ''){
                    continue;
                }

                if(preg_match_all($strPattern, $strPage, $arrMatches)){
                    foreach($arrMatches[2] as $strLink){
                        // href values are HTML attribute text: "&amp;" means "&"
                        $this->addURL(html_entity_decode($strLink, ENT_QUOTES, 'UTF-8'));
                    }
                }
                //drop the page from memory before fetching the next one
                unset($strPage, $arrMatches);
            }

            //remove any possible duplicate URLs
            $this->arrURLs = array_unique($this->arrURLs);
        } while(count($arrQueue) > 0);
    }

    /**
     * Downloads a page and prints a progress line.
     *
     * When the download fails the URL is removed from $arrURLs so that it does
     * not end up in the sitemap.
     *
     * @param string $strURL URL (or path) to read
     * @return string|null The page content ("" for an empty page), or null on failure
     */
    public function getPage($strURL){
        echo "Reading…" . htmlspecialchars($strURL, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br/>\n";

        // "@" is deliberate: a dead link is an expected outcome while crawling
        // and is handled below, so the warning would only be noise.
        $strContent = @file_get_contents($strURL);
        if($strContent !== false){
            return $strContent;
        }

        //remove this URL from the list
        $numKey = array_search($strURL, $this->arrURLs, true);
        if($numKey !== false){
            // without this guard a "not found" result (false) would unset index 0
            unset($this->arrURLs[$numKey]);
        }
        return null;
    }

    /**
     * Normalises a link found on a page and queues it if it belongs to the
     * crawled site, has an allowed extension and has not been seen before.
     *
     * @param string $strURL Raw href value (absolute, root-relative or relative)
     * @return void
     */
    public function addURL($strURL){
        $strURL = trim($strURL);
        if($strURL === ''){
            //empty or fragment-only href
            return;
        }

        if(strncmp($strURL, '//', 2) === 0){
            //protocol-relative link: reuse the scheme of the base URL
            $strScheme = parse_url($this->strURL, PHP_URL_SCHEME);
            $strURL = ($strScheme ? $strScheme : 'http') . ':' . $strURL;
        }elseif(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $strURL)){
            //has a scheme: only http(s) can be crawled (skips mailto:, javascript:, ftp:, ...)
            if(!preg_match('/^https?:\/\//i', $strURL)){
                return;
            }
        }elseif($strURL[0] === '/'){
            //root-relative internal link
            $strURL = $this->strURL . $strURL;
        }else{
            //relative internal link
            $strURL = $this->strURL . '/' . $strURL;
        }

        //if its an internal link on the same site add it to our list
        if(!$this->isInternalURL($strURL)){
            return;
        }
        if(isset($this->arrKnownURLs[$strURL])){
            //already queued, or already tried and found dead
            return;
        }
        if($this->checkURL($strURL)){
            $this->arrKnownURLs[$strURL] = true;
            $this->arrURLs[] = $strURL;
        }
    }

    /**
     * Decides from its file extension whether a URL looks like a crawlable page.
     *
     * URLs without an extension (directories, "pretty" URLs) are accepted, and so
     * are extensions longer than four characters; a short extension must be
     * listed in $arrAllowedExtensions.
     *
     * @param string $strURL Full URL to check
     * @return int 1 when the URL may be added, 0 otherwise
     */
    public function checkURL($strURL){
        // Look at the path only, so the host's TLD and any query string are not
        // mistaken for a file extension.
        $strPath = parse_url($strURL, PHP_URL_PATH);
        if($strPath === false){
            //seriously malformed URL
            return 0;
        }

        $strExt = strtolower((string) pathinfo((string) $strPath, PATHINFO_EXTENSION));
        if($strExt !== '' && strlen($strExt) <= 4){
            //check the string extension against allowed types
            $arrAllowed = array_map('strtolower', $this->arrAllowedExtensions);
            if(!in_array($strExt, $arrAllowed, true)){
                return 0;
            }
        }
        return 1;
    }

    /**
     * Builds the sitemap XML for every URL collected so far.
     *
     * NOTE(review): the 0.84 namespace is kept from the original code; the
     * current sitemaps.org protocol uses a different namespace - confirm which
     * one the consumer expects.
     *
     * @return string The sitemap document
     */
    public function displaySiteMap(){
        $strSitemap = '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.google.com/schemas/sitemap/0.84">';
        foreach($this->arrURLs as $strURL){
            $strSitemap .= "\n<url>\n<loc>" . $this->escapeXML($strURL) . "</loc>\n<changefreq>weekly</changefreq>\n</url>\n";
        }
        $strSitemap .= '</urlset>';
        return $strSitemap;
    }

    /**
     * Writes the sitemap XML for every URL collected so far to a file.
     *
     * @param string $strFile Path of the file to (over)write
     * @return bool true when the file was written, false when it could not be opened
     */
    public function saveSiteMap($strFile = "sitemap.xml"){
        $resFP = fopen($strFile, 'w+');
        if(!$resFP){
            return false;
        }

        fwrite($resFP, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\r\n");
        foreach($this->arrURLs as $strURL){
            fwrite($resFP, "<url>\r\n");
            fwrite($resFP, "<loc>" . $this->escapeXML($strURL) . "</loc>\r\n");
            fwrite($resFP, "<changefreq>weekly</changefreq>\r\n");
            fwrite($resFP, "</url>\r\n");
        }
        fwrite($resFP, "</urlset>");
        fclose($resFP);
        return true;
    }

    /**
     * Tells whether a URL lies on the site being crawled.
     *
     * The character after the base URL must end the URL or start a path, query
     * or fragment, so "http://site.com.evil.org" is not mistaken for
     * "http://site.com".
     *
     * @param string $strURL Absolute URL to test
     * @return bool
     */
    private function isInternalURL($strURL){
        $numBaseLength = strlen($this->strURL);
        if($numBaseLength === 0 || strncasecmp($strURL, $this->strURL, $numBaseLength) !== 0){
            return false;
        }
        $strNext = (string) substr($strURL, $numBaseLength, 1);
        return $strNext === '' || strpos('/?#', $strNext) !== false;
    }

    /**
     * Escapes a value for use as XML character data (a raw "&" in a URL would
     * otherwise make the sitemap invalid).
     *
     * @param string $strValue
     * @return string
     */
    private function escapeXML($strValue){
        return htmlspecialchars($strValue, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
- ?>



