RedVodkaJelly Logo

PHP Google XML Sitemap Bot Class

POSTED AT 15:35 on 23rd May 2008

This bot will crawl any site and produce a valid Google XML sitemap, ready to be submitted through the Sitemaps section of Google Webmaster Tools.

Simply provide the URL of the site you wish to crawl (for example http://www.redvodkajelly.com), and this PHP class will follow the links it finds on your pages and produce a valid XML sitemap.

If you would like a sitemap produced for you but don’t know how to run this script, my company frozensheep offers this as a service - just get in touch with us.

Source Code

  1. <?php 
  2. /* 
  3. * Google Sitemap Bot 
  4. * Class to deal with creating an automatic google sitemap. 
  5. * Will create a valid google XML sitemap from any base URL 
  6. *  
  7. * Written By Jacob Wyke - jacob@redvodkajelly.com - www.redvodkajelly.com 
  8. *  
  9. * LICENSE 
  10. * ------- 
  11. * Feel free to use this as you wish; just give me credit where credit's due and drop me an email telling me what you're using it for, so I can check out all the cool ways it's been used. 
  12. * USAGE 
  13. * ----- 
  14. * To use simply call the class with the desired URL. 
  15. *      $objBot = new RVJ_googleMapBot("http://www.redvodkajelly.com"); 
  16. *      $objBot->saveSiteMap("sitemap.xml"); 
  17. * Your sitemap will then be saved with any page on the domain found through crawling the site. 
  18. */ 
  19.  
  20. set_time_limit(0); // 0 removes PHP's execution time limit -- presumably because crawling a whole site can outlast the default limit

     

  21.  
  /*
  * Crawls a single website and collects the URLs of its pages so they can be
  * rendered as an XML sitemap.
  *
  * Constructing the object starts the crawl immediately: the base URL is
  * fetched, every href found in it is turned into a full URL, and each internal
  * URL that passes checkURL() is fetched in turn until no unvisited URL is left.
  * The result is available through displaySiteMap() and saveSiteMap().
  *
  * Known limitation: relative links are resolved against the base URL, not
  * against the directory of the page they were found on.
  */
  class RVJ_googleMapBot
  {
      /* XML namespace of the sitemaps.org 0.9 protocol */
      const SITEMAP_NAMESPACE = 'http://www.sitemaps.org/schemas/sitemap/0.9';

      /* base URL of the site being crawled, exactly as passed to the constructor */
      public $strURL;

      /* every URL found so far; this is the list written to the sitemap */
      public $arrURLs = array();

      /* URLs that parseSite() has already handled (fetched, failed or skipped) */
      public $arrCheckedURLs = array();

      /* file extensions (up to four characters) that may appear in the sitemap */
      public $arrAllowedExtensions = array('html', 'htm', 'php', 'php3', 'asp', 'asx', 'xml', 'cfc');

      /* never populated by the crawler; declared only so existing reads of it keep working */
      public $numLevelsDeep = null;

      /*
      *
      *   @Method:      __construct
      *   @Parameters:  1
      *   @Param-1:     strURL - String - The URL you want to crawl
      *   @Description: Stores the base URL, queues it as the first page and
      *                 runs the bot to crawl the requested site
      *
      */
      public function __construct($strURL)
      {
          $this->strURL = $strURL;
          $this->arrURLs[] = $this->strURL;

          $this->parseSite();
      }

      /*
      *
      *   @Method:      parseSite
      *   @Parameters:  0
      *   @Description: Fetches every URL in arrURLs that is not yet in
      *                 arrCheckedURLs, feeds the links found on each page to
      *                 addURL() and repeats until a pass finds nothing new.
      *                 Works as a loop rather than by recursion, so the call
      *                 depth does not grow with the number of pages.
      *
      */
      public function parseSite()
      {
          //lookup table of the URLs that have already been handled
          $arrDone = array_fill_keys($this->arrCheckedURLs, true);

          do {
              $blnFoundNew = false;

              //walk a snapshot: URLs appended by addURL() are handled on the next pass
              $arrSnapshot = $this->arrURLs;
              foreach ($arrSnapshot as $strURL) {
                  if (isset($arrDone[$strURL])) {
                      continue;
                  }

                  //mark as checked up front so a failing page is never retried
                  $arrDone[$strURL] = true;
                  $this->arrCheckedURLs[] = $strURL;
                  $blnFoundNew = true;

                  //only pages of the crawled site are fetched
                  if (!$this->isInternalURL($strURL)) {
                      continue;
                  }

                  $strPage = $this->getPage($strURL);
                  if ($strPage === null) {
                      continue;
                  }

                  foreach ($this->extractLinks($strPage) as $strLink) {
                      $this->addURL($strLink);
                  }

                  //dump the page from memory before fetching the next one
                  unset($strPage);
              }
          } while ($blnFoundNew);

          //remove any possible duplicate URLs and close the gaps left by failed pages
          $this->arrURLs = array_values(array_unique($this->arrURLs));
      }

      /*
      *
      *   @Method:      getPage
      *   @Parameters:  1
      *   @Param-1:     strURL - String - The URL to fetch
      *   @Description: Prints a progress line and downloads the page.
      *   @Return:      The page content as a string (possibly empty), or null
      *                 when the page could not be read; in that case the URL is
      *                 also removed from arrURLs.
      *
      */
      public function getPage($strURL)
      {
          //the URL comes from crawled pages, so escape it before echoing it into HTML
          echo "Reading…" . htmlspecialchars($strURL, ENT_QUOTES, 'UTF-8') . "<br/>\n";

          //a dead link is an expected outcome while crawling, so the warning is suppressed on purpose
          $strContent = @file_get_contents($strURL);
          if ($strContent !== false) {
              return $strContent;
          }

          //remove this URL from the list; array_search() returns false when it is not
          //there, and unsetting index false would delete the entry at key 0
          $numKey = array_search($strURL, $this->arrURLs, true);
          if ($numKey !== false) {
              unset($this->arrURLs[$numKey]);
          }

          return null;
      }

      /*
      *
      *   @Method:      addURL
      *   @Parameters:  1
      *   @Param-1:     strURL - String - A link target as found in a page
      *   @Description: Turns the link into a full URL and appends it to arrURLs
      *                 when it belongs to the crawled site, passes checkURL()
      *                 and has not been seen before. Links with any other
      *                 scheme (mailto:, javascript:, ...) and links to other
      *                 hosts are ignored.
      *
      */
      public function addURL($strURL)
      {
          $strURL = trim((string) $strURL);
          if ($strURL === '') {
              return;
          }

          $strBase = (string) $this->strURL;

          if (substr($strURL, 0, 2) === '//') {
              //protocol-relative link: borrow the scheme of the base URL
              $strScheme = parse_url($strBase, PHP_URL_SCHEME);
              $strURL = ($strScheme ? $strScheme : 'http') . ':' . $strURL;
          } elseif (
              $strURL[0] === '/'
              || (!$this->isInternalURL($strURL) && !preg_match('/^[a-z][a-z0-9+.-]*:/i', $strURL))
          ) {
              //its an internal link: join it to the base URL with exactly one slash
              $strURL = rtrim($strBase, '/') . '/' . ltrim($strURL, '/');
          }

          //a link to the site root is the base URL itself
          if (rtrim($strURL, '/') === rtrim($strBase, '/')) {
              $strURL = $strBase;
          }

          //if its an internal link on the same URL add it to our list
          if (
              $this->isInternalURL($strURL)
              && $this->checkURL($strURL)
              && !in_array($strURL, $this->arrURLs, true)
              //a checked URL that is missing from arrURLs failed to load: do not bring it back
              && !in_array($strURL, $this->arrCheckedURLs, true)
          ) {
              $this->arrURLs[] = $strURL;
          }
      }

      /*
      *
      *   @Method:      checkURL
      *   @Parameters:  1
      *   @Param-1:     strURL - String - A full URL
      *   @Description: Decides from the file extension whether the URL may be
      *                 added. The extension is read from the last segment of
      *                 the URL path only, so dots in the host name or in the
      *                 query string are not mistaken for an extension.
      *                 Extensions longer than four characters are not filtered.
      *   @Return:      1 when the URL is acceptable, 0 when it is not
      *
      */
      public function checkURL($strURL)
      {
          $strPath = parse_url($strURL, PHP_URL_PATH);
          if ($strPath === false) {
              //seriously malformed URL
              return 0;
          }
          if ($strPath === null) {
              //no path at all (bare host), so there is no extension to check
              return 1;
          }

          //isolate the last path segment and its extension, if there is one
          $numSlash = strrpos($strPath, '/');
          $strFile = ($numSlash === false) ? $strPath : (string) substr($strPath, $numSlash + 1);
          $numDot = strrpos($strFile, '.');
          if ($numDot === false) {
              return 1;
          }

          $strExt = strtolower((string) substr($strFile, $numDot + 1));
          if ($strExt === '' || strlen($strExt) > 4) {
              return 1;
          }

          //check the string extension against allowed types
          $arrAllowed = array_map('strtolower', $this->arrAllowedExtensions);

          return in_array($strExt, $arrAllowed, true) ? 1 : 0;
      }

      /*
      *
      *   @Method:      displaySiteMap
      *   @Parameters:  0
      *   @Description: Builds the sitemap document from arrURLs. Every URL is
      *                 XML-escaped and listed with a weekly change frequency.
      *   @Return:      The sitemap XML as a string
      *
      */
      public function displaySiteMap()
      {
          $strSitemap = '<?xml version="1.0" encoding="UTF-8"?>' . "\n"
              . '<urlset xmlns="' . self::SITEMAP_NAMESPACE . '">' . "\n";

          foreach ($this->arrURLs as $strURL) {
              $strSitemap .= "  <url>\n"
                  . '    <loc>' . $this->escapeXML($strURL) . "</loc>\n"
                  . "    <changefreq>weekly</changefreq>\n"
                  . "  </url>\n";
          }

          return $strSitemap . '</urlset>';
      }

      /*
      *
      *   @Method:      saveSiteMap
      *   @Parameters:  1
      *   @Param-1:     strFile - String - Path of the file to write
      *   @Description: Writes the document produced by displaySiteMap() to the
      *                 given file, replacing any existing content.
      *   @Return:      true when the file was written, false otherwise
      *
      */
      public function saveSiteMap($strFile = "sitemap.xml")
      {
          return file_put_contents($strFile, $this->displaySiteMap()) !== false;
      }

      /*
      *   Tells whether a URL lies under the base URL: it must start with the
      *   base URL (compared case-insensitively) and the base URL must end at a
      *   URL boundary, so "http://a.com.evil.org" is not taken for "http://a.com".
      */
      private function isInternalURL($strURL)
      {
          $strBase = (string) $this->strURL;
          $numBaseLength = strlen($strBase);

          if ($numBaseLength === 0 || strncasecmp($strURL, $strBase, $numBaseLength) !== 0) {
              return false;
          }
          if (strlen($strURL) === $numBaseLength || substr($strBase, -1) === '/') {
              return true;
          }

          return in_array($strURL[$numBaseLength], array('/', '?', '#'), true);
      }

      /*
      *   Returns the href targets found in a page, without their #fragment,
      *   with HTML entities decoded (so "&amp;" becomes "&") and empty targets
      *   dropped.
      */
      private function extractLinks($strPage)
      {
          $arrLinks = array();

          //thanks to http://www.sphider.eu/ for the following regex
          $strPattern = "/(href)\s*=\s*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/\?~=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i";

          if (preg_match_all($strPattern, $strPage, $arrMatches)) {
              foreach ($arrMatches[2] as $strLink) {
                  $strLink = trim(html_entity_decode($strLink, ENT_QUOTES, 'UTF-8'));
                  if ($strLink !== '') {
                      $arrLinks[] = $strLink;
                  }
              }
          }

          return $arrLinks;
      }

      /*
      *   Escapes the five characters that must be written as entities inside
      *   a sitemap value.
      */
      private function escapeXML($strValue)
      {
          return str_replace(
              array('&', '<', '>', '"', "'"),
              array('&amp;', '&lt;', '&gt;', '&quot;', '&apos;'),
              (string) $strValue
          );
      }
  }
     
  199.  
  200. ?>