Get all crawl errors from Google Webmaster Tools


You can only download the top crawl errors via the Google Webmaster Tools interface at https://www.google.com/webmasters/tools/

If you want to retrieve all crawl errors, the script below automates it by paging through the crawl-issues feed.

First, install Python, pip, and the gdata library if they are not already present:
$ sudo apt-get install python
$ sudo apt-get install python-pip
$ pip install gdata


Then run the script:
$ python gwt_crawlerrors.py

Contents of gwt_crawlerrors.py:
import gdata.webmastertools.service
import gdata.service

try:
 from xml.etree import ElementTree as ET
except ImportError:
 from elementtree import ElementTree
import atom
import getpass

username = raw_input("Username: ")
password =getpass.getpass()
domain =  raw_input("Domain(e.g. uk.queryclick.com): ").replace('.', '%2E')
if  domain.endswith('/'):
   domain = domain[:-1]
domain = 'http%3A%2F%2F'  + domain + '%2F'
url = 'https://www.google.com/webmasters/tools/feeds/%s/crawlissues/' % domain
client = gdata.webmastertools.service.GWebmasterToolsService(
       email=username,
       password=password, source='QueryClickWebmasterToolsPythonExample')

def main():
   print 'Logging in'
   client.ProgrammaticLogin()
   print 'Retrieving crawl errors'
   start_index = 1
   total_results, entries = get_errors(start_index)
   f = open('crawl_errors.csv', 'w')
   save_results_csv(entries, f)
   remainder = total_results - 100
   progress =  'Fetching %s - %s of %s'  % (start_index, start_index + 99, total_results)
   print progress
   while remainder > 0:
       start_index += 100
       total_results, entries = get_errors(start_index)
       progress =  'Fetching %s - %s of %s'  % (start_index, start_index + 99, total_results)
       print progress
       save_results_csv(entries, f)
       remainder -= 100
   f.close()

def get_errors(start_index):
   req_url  =  '%s?start-index=%s&max-results=100' % (url, start_index)
   res_stream = client.request('GET', req_url)
   result  = res_stream.read()
   root = ET.fromstring(result)
   total_results = int(root.findall('{http://a9.com/-/spec/opensearchrss/1.0/}totalResults')[0].text)
   entries = root.findall('{http://www.w3.org/2005/Atom}entry')
   return total_results, entries

def save_results_csv(entries, file):
   for entry in entries:
      file.write('%s, %s, %s, %s, %s\n' % (entry[5].text, entry[6].text, entry[7].text, entry[8].text, entry[9].text))
if  __name__ =='__main__':main()

No comments:

Post a Comment