User:MonoBot/Source

"""
This is a Python script written using the pywikipediabot framework. You can
find out more about pywikipedia at http://meta.wikimedia.org/wiki/pywikipediabot

This script automatically creates articles after querying ITIS for information.

Released under the Creative Commons Attribution-Noncommercial-Share Alike 3.0
United States License.

(C) Monobi 2008
"""
import wikipedia, pagegenerators, re, urllib, urllib2

def getsource(pagereq):
    """Fetch the raw HTML of the given URL, sending a browser-like
    User-Agent so the request is not rejected as a script."""
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    header = {'User-Agent': user_agent}
    req = urllib2.Request(pagereq, None, header)
    response = urllib2.urlopen(req)
    page = response.read()
    return page
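
# Illustrative only: the main loop below calls getsource() with the full ITIS
# single-report URL; an abridged example (query parameters trimmed):
#
#   html = getsource("http://www.itis.gov/servlet/SingleRpt/SingleRpt"
#                    "?search_topic=all&search_value=Panthera+leo")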

def main():
    #########################
    #    1 = on, 0 = off    #
    #########################
    on = 0

    ############################################
    # Various regexes for matching needed data #
    ############################################
    founder_species = re.compile("&nbsp;<B>(?P<author>.*?)</B>", re.I)
    vn_es_template = re.compile("width=\"(.*?)\W\">(?P<spanish>.*?)\[Spanish\]", re.I)
    vn_en_template = re.compile("width=\"(.*?)\W\">(?P<english>.*?)\[English\]", re.I)
    genus_template = re.compile("Genus</TD><TD class=datafield vAlign=top width=\"71%\"><A HREF=\"SingleRpt\?search_topic=TSN&search_value=.*\">(?P<template>.*?)</A>", re.I)
    subspecies_template = re.compile("Subspecies</TD><TD class=datafield vAlign=top width=\"71%\"><A HREF=\"SingleRpt\?search_topic=TSN&search_value=(.*?)\">(?P<subspeciesname>.*?)</A>", re.I)
    serial_number = re.compile("Taxonomic Serial No\.\: (?P<ID>.*?)</SPAN>", re.I)
    no_records = re.compile("No Records Found", re.I)
    article_match = re.compile(r"\[\[(.*?)\]\]", re.I)
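
    # Illustrative fragments of the ITIS report HTML that these patterns
    # target, reconstructed from the regexes themselves (values hypothetical):
    #
    #   &nbsp;<B>(Linnaeus, 1758)</B>            -> founder_species
    #   width="25%">oso negro&nbsp;[Spanish]     -> vn_es_template
    #   Taxonomic Serial No.: 183803</SPAN>      -> serial_number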

    ##############################################################
    # Where the bot downloads the list of articles to be created #
    ##############################################################
    pagelist = "speciespages.txt"

    ######################
    # Site (wikispecies) #
    ######################
    site = wikipedia.getSite('species', 'species')

    ###################################
    # Location of pages to be created #
    ###################################
    
    to = wikipedia.Page(site, 'User:MonoBot/Requested Articles')
    totext = to.get()
    # page.get() returns unicode; encode before writing to a plain file.
    f = open(pagelist, 'w')
    f.write(totext.encode('utf-8'))
    f.close()
    for line in open(pagelist,'r'):
        prelim_article = article_match.findall(line)
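        # e.g. a request line "*[[Panthera leo]]" yields ["Panthera leo"]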
        for article_name in prelim_article:
            try:
                ###################################################
                # Defaults (None = no match) for the data to mine #
                ###################################################
                spanish_vn = None
                english_vn = None
                founder = None
                # Also reset these so a failed lookup cannot reuse a
                # value left over from the previous article.
                my_template = None
                my_id = None

                ##############################
                # Send query to ITIS website #
                ##############################
                article_name = article_name.replace(' ', '+')
                data_page = getsource("http://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=all&search_kingdom=every&search_span=exactly_for&search_value=%s&categories=All&source=html&search_credRating=All&Go=Search" % article_name)

                ##########################
                # Get the genus template #
                ##########################
                # no_records was already compiled with re.I; passing the flag
                # again to re.search() is an error with a compiled pattern.
                if not re.search(no_records, data_page):
                    template_getter = re.search(genus_template, data_page)
                    if template_getter:
                        my_template = template_getter.group('template')
                    
                    ##########################
                    # Get the ITIS ID number #
                    ##########################
                
                    id_getter = re.search(serial_number, data_page)
                    if id_getter:
                        my_id = id_getter.group('ID')

                    ############################
                    # Working on subspecies... #
                    ############################
                
                    subspecies_list = re.search(subspecies_template, data_page)
                    if subspecies_list:
                        # Captured but not yet used when the page is built.
                        subspecies_individual = subspecies_list.group('subspeciesname')

                    ################################
                    # Get the |es= part for {{VN}} #
                    ################################
                    get_spanish_vn = re.search(vn_es_template, data_page)
                    if get_spanish_vn:
                        spanish_vn = get_spanish_vn.group('spanish')
                        spanish_vn_list = spanish_vn.split()
                        # Capitalize the first word and keep at most the first
                        # two; indexing [1] directly broke on one-word names.
                        if spanish_vn_list:
                            spanish_vn_list[0] = spanish_vn_list[0].title()
                            spanish_vn = " ".join(spanish_vn_list[:2])
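                            # e.g. a hypothetical entry "oso negro" becomes
                            # "Oso negro" for the {{VN}} |es= field.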

                    ################################
                    # Get the |en= part for {{VN}} #
                    ################################
                    get_english_vn = re.search(vn_en_template, data_page)
                    if get_english_vn:
                        english_vn = get_english_vn.group('english')
                        english_vn_list = english_vn.split()
                        # Same guard as above for one-word vernacular names.
                        if english_vn_list:
                            english_vn_list[0] = english_vn_list[0].title()
                            english_vn = " ".join(english_vn_list[:2])
                    
                    ############################
                    # Get the authors and date #
                    ############################
                    get_founder_search = re.search(founder_species, data_page)
                    if get_founder_search:
                        founder = get_founder_search.group('author')
                        founder = founder.split()
                        final = ''
                        for founders in founder:
                            final += u" [[%s]]" % founders

                        final = final.replace("[[and]]", "&")
                        final = final.replace("[[in]]", "in ")
                        final = final.replace("(", "")
                        final = final.replace(")", "")
                        final = final.replace(",", "")
                        final = re.sub(r"\[\[(\d{4})\]\]", r"\1", final)
                        final = final.replace(" [[", "[[")
                        final = final.replace("&[[", "& [[")
                        final = final.replace("[[van]]", "van")
                        final = re.sub(r'\]\] (\d{4})', r']], \1', final)
                        final = final.replace("]][[", "]] [[")
                        founder = final
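                        # Traced example: an ITIS author string of
                        # "(Linnaeus, 1758)" comes out as "[[Linnaeus]], 1758".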
                        #wikipedia.output(founder)
                        

                ####################################################
                # Fall back to empty strings if no match was found #
                ####################################################

                if spanish_vn is None:
                    spanish_vn = ""
                if english_vn is None:
                    english_vn = ""
                if founder is None:
                    founder = ""

                ###############################################
                # Change the article_name back into a page    #
                ###############################################

                if my_template is None or my_id is None:
                    # No usable ITIS record was found; skip this request
                    # rather than build a page around missing data.
                    continue

                article_name = article_name.replace("+", " ")
                article_name = wikipedia.Page(site, article_name)
                
                ##############################################
                # Define all of the information to be placed #
                ##############################################
                taxonav = "==Taxonavigation==\n{{%s}}\nSpecies: ''[[%s]]''" % (my_template, article_name.titleWithoutNamespace())
                name = "\n\n==Name==\n''{{subst:PAGENAME}}'' (%s)\n\n" % founder
                references = "==References==\n*{{ITIS|%s}}\n\n" % my_id
                vern_names = "==Vernacular names==\n{{VN\n|en=%s\n|es=%s}}" % (english_vn, spanish_vn)
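
                # Put together, the page body looks roughly like this
                # (genus, species, serial number and names hypothetical):
                #
                #   ==Taxonavigation==
                #   {{Panthera}}
                #   Species: ''[[Panthera leo]]''
                #
                #   ==Name==
                #   ''{{subst:PAGENAME}}'' ([[Linnaeus]], 1758)
                #
                #   ==References==
                #   *{{ITIS|183803}}
                #
                #   ==Vernacular names==
                #   {{VN
                #   |en=African lion
                #   |es=León africano}}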
                    

                ###################################################################
                # if the article doesn't exist & an es/en name found & it's on   #
                ###################################################################
                # Parentheses matter here: without them, "and"/"or" precedence
                # made the check fire even when the bot was switched off.
                if not article_name.exists() and (english_vn != "" or spanish_vn != "") and on == 1:
                    newtext = taxonav + name + references + vern_names
                    wikipedia.output(newtext)
                    article_name.put_async(newtext, 'Automatically creating species page', True)

                ###################################################################
                # if the article doesn't exist & no author found & it's on       #
                ###################################################################
                elif not article_name.exists() and founder == "" and on == 1:
                    newtext2 = taxonav + references
                    wikipedia.output(newtext2)
                    article_name.put_async(newtext2, 'Automatically creating species page', True)
                    
                ###################################################################
                # if the article doesn't exist & an author found & it's on       #
                ###################################################################
                elif not article_name.exists() and founder != "" and on == 1:
                    newtext3 = taxonav + name + references
                    article_name.put_async(newtext3, 'Automatically creating species page', True)

                ########################################
                # otherwise: the page exists or we're  #
                # switched off, so do nothing          #
                ########################################
                else:
                    pass

            except wikipedia.Error:
                wikipedia.output(u"Error on %s" % article_name)
                continue
            
            except UnicodeDecodeError:
                wikipedia.output(u"Error with unicode, skipping...")
                continue

    ###################################################
    # Same page as the list of articles to be created #
    #       This cleans up the created articles       #
    ###################################################
    clean_page = wikipedia.Page(site, 'User:MonoBot/Requested Articles')
    try:
        clean_text = clean_page.get()
        cleaner_text = ":'''Admins''': ''Please look at the [<!-- -->[User:MonoBot/Instruction]<!-- -->] page before adding any pages here''\n"
        clean_page.put(cleaner_text, 'Cleaning list of processed articles', True)
    except wikipedia.LockedPage:
        wikipedia.output(u"Can't save page, don't have permission")

    except wikipedia.Error:
        wikipedia.output(u"Error")


if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()