English Wikipedia @ Freddythechick:WikiProject Yorkshire/Articles/Source

<syntaxhighlight lang="python">
import sys
import catlib
import wikipedia
import codecs

# the maximum number of articles per page

MAX = 6000

# should we write to a local file (True) or directly to Wikipedia (False)?

DBG = False


class Watchlist:
   """
   Collects the page titles that belong to one WikiProject, sorted into one
   list per namespace (plus a matching list of talk pages), so that they can
   be written out as a "watchlist" page on the wiki.

   The attributes below are class-level defaults that document the fields;
   the constructor gives every instance its own fresh values.
   """

   # the name of the template used to tag articles, e.g., "Numismaticnotice"
   template = ""
   # the name of the project, e.g., "Numismatics"
   project = ""
   # the location of the article list (output) -- without prefix, so for
   # "Wikipedia:WikiProject Numismatics/Articles", use "Articles"
   articleOut = ""

   # a list for all articles
   articles = []
   # a list for all article talk pages
   articlesTalk = []
   # a list for all Wikipedia pages
   wikis = []
   # a list for all Wikipedia talk pages
   wikisTalk = []
   # a list for all templates
   templates = []
   # a list for all template talk pages
   templatesTalk = []
   # a list for all categories
   categories = []
   # a list for all category talk pages
   categoriesTalk = []
   # a list for all images
   images = []
   # a list for all image talk pages
   imagesTalk = []
   # a list for all portals
   portals = []
   # a list for all portal talk pages
   portalsTalk = []

   # certain pages need to be included explicitly (for example, if they share
   # a talk page)
   includePages = []

def __init__(self, template, project, articleOut, includePages = []):

   def __init__(self, project, template, articleOut, includePages = []):
       self.template = template
       self.project = project
       self.articleOut = articleOut
       self.articles = []
       self.articlesTalk = []
       self.wikis = []
       self.wikisTalk = []
       self.templates = []
       self.templatesTalk = []
       self.categories = []
       self.categoriesTalk = []
       self.images = []
       self.imagesTalk = []
       self.portals = []
       self.portalsTalk = []
       self.includePages = includePages
   
   def processPageName (self, name):
       """
       Process one page name, updating the lists as appropriate.
       """
       result = name.split(":",1)
       if len(result) == 1:
           self.articles.append(result[0])
           self.articlesTalk.append("Talk:"+result[0])
       elif result[0] == "Talk":
          self.articles.append(result[1])
          self.articlesTalk.append("Talk:"+result[1])
       elif result[0] == "Wikipedia talk" or \
            result[0] == "Wikipedia":
          self.wikis.append("Wikipedia:"+result[1])
          self.wikisTalk.append("Wikipedia talk:"+result[1])
       elif result[0] == "Template talk" or \
            result[0] == "Template":
          self.templates.append("Template:"+result[1])
          self.templatesTalk.append("Template talk:"+result[1])
       elif result[0] == "Category talk" or \
            result[0] == "Category":
          self.categories.append(":Category:"+result[1])
          self.categoriesTalk.append("Category talk:"+result[1])
       elif result[0] == "Image talk" or \
            result[0] == "Image":
          self.images.append(":Image:"+result[1])
          self.imagesTalk.append("Image talk:"+result[1])
       elif result[0] == "Portal talk" or \
            result[0] == "Portal":
          self.portals.append("Portal:"+result[1])
          self.portalsTalk.append("Portal talk:"+result[1])

   def scanCat (self, catName, recurse):
       """
       Add every page of the category catName to the lists, then add the
       category itself (and its talk page) to the category lists.

       recurse is handed straight to catlib's Category.articles().
       """
       category = catlib.Category(wikipedia.getSite(), catName)
       for member in category.articles(recurse):
           self.processPageName(member.title())

       # the category is recorded after its members, in link form
       self.categories.append(":Category:"+catName)
       self.categoriesTalk.append("Category talk:"+catName)

   def removeDuplicatesAndSort (self):
       self.articles = sorted(set(self.articles))
       self.articlesTalk = sorted(set(self.articlesTalk))
       self.wikis = sorted(set(self.wikis))
       self.wikisTalk = sorted(set(self.wikisTalk))
       self.templates = sorted(set(self.templates))
       self.templatesTalk = sorted(set(self.templatesTalk))
       self.categories = sorted(set(self.categories))
       self.categoriesTalk = sorted(set(self.categoriesTalk))
       self.images = sorted(set(self.images))
       self.imagesTalk = sorted(set(self.imagesTalk))
       self.portals = sorted(set(self.portals))
       self.portalsTalk = sorted(set(self.portalsTalk))

   def getTaggedPages (self):
       """
       Fill the lists with every page that transcludes the project template,
       plus the explicitly included pages, then de-duplicate and sort.
       """
       site = wikipedia.getSite()
       templatePage = wikipedia.Page(site, "Template:" + self.template)
       for ref in templatePage.getReferences(onlyTemplateInclusion=True):
           self.processPageName(ref.title())

       # the explicitly named pages come after the tagged ones
       for extra in self.includePages:
           self.processPageName(extra)

       self.removeDuplicatesAndSort()

       # the categories are deliberately not organized hierarchically here
       # (that takes too much time)

   def getPagesFromCategory (self):
       """
       Fill the lists from the members of
       "Category:WikiProject <project> articles", then de-duplicate and sort.
       """
       categoryTitle = "Category:WikiProject " + self.project + " articles"
       wikipedia.output(u"Getting from category " + categoryTitle)

       found = []
       findArticlesInCategory(categoryTitle, found)

       for title in sorted(set(found)):
           self.processPageName(title)

       self.removeDuplicatesAndSort()

   def getPagesFromTaggedCategories (self):
       """
       Fill the lists from the members of every category whose talk page
       transcludes the project template.  The tagged category itself is
       listed as well.  Finishes by de-duplicating and sorting the lists.
       """
       page = wikipedia.Page(wikipedia.getSite(), "Template:" + self.template)
       refs = page.getReferences(onlyTemplateInclusion=True)

       # collect the talk-page titles of all members of the tagged categories
       articles = []
       for ref in refs:
           talkTitle = ref.title()
           # split on the first colon only: a category name may itself
           # contain colons and must not be cut short
           result = talkTitle.split(":", 1)
           if result[0] == "Category talk": # we expect this
               findArticlesInCategory("Category:" + result[1], articles)
               # add the category to the list as well
               articles.append(talkTitle)

       for name in sorted(set(articles)):
           self.processPageName(name)

       # remove duplicates and sort the lists
       self.removeDuplicatesAndSort()

       # the categories are deliberately not organized hierarchically here
       # (that takes too much time)

   def writeList (self, taggedPagesFlag):
       """
       write the output to the specified page on Wikipedia
       taggedPagesFlag tells whether we're looking for tagged pages (true)
       or tagged categories (false)

       The main page is "Wikipedia:WikiProject <project>/<articleOut>".  When
       the number of pages (doubled, to account for talk pages) exceeds MAX,
       the overflow is written to numbered sub-pages "<articleOut>/Page<n>".
       Every listed page is written as a wiki link, because the "related
       changes" tracking the page advertises follows links.
       """

       tagText = ""
       if not taggedPagesFlag:
           tagText = "in categories "

       # the output page, without spaces
       wikipedia.output(u"Preparing output")

       output = self.project.replace(" ", "_") + "/" + \
                self.articleOut.replace(" ", "_")

       totalArticles = len(self.articles) + len(self.wikis) + \
                       len(self.templates) + len(self.categories) + \
                       len(self.images) + len(self.portals)

       # NOTE(review): this banner reached us with its wrapper markup
       # stripped and the literal broken across lines; only the sentence
       # itself could be recovered.  Re-add the original markup if known.
       mainText = "This page is automatically " + \
                  "recreated from time to time. Accordingly, any changes you " + \
                  "make here will be overwritten. See below for details.\n\n"

       # double the number of articles because of talk pages
       splitting = totalArticles*2 > MAX
       if splitting:
           mainText += "There are too many articles in this project to list " + \
                       "them all on one page. This article contains the first " + \
                       str(MAX) + " articles and links to other articles which " + \
                       "contain "
       else:
           mainText += "This article contains "
       mainText += "links to all articles, categories, images, portal pages " + \
                   "templates, and project pages " + tagText + "with {{[[Template:" + \
                   self.template + "|" + \
                   self.template + "]]}} on their talk page. It was " + \
                   "generated by " + \
                   "Ganeshbot. Its purpose is to be able to track " + \
                   "the project history using [[Special:Recentchangeslinked/" + \
                   "Wikipedia:WikiProject " + output + \
                   "|related changes]] or [http://tools.wikimedia.de/~interiot/" + \
                   "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" + \
                   "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" + \
                   "%3DWikipedia:WikiProject_" + output + \
                   "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist] which " + \
                   "only shows the last change for each article.\n" + \
                   "\n"

       mainText += "==Regular content (count: " + str(totalArticles) + ")==\n"

       # the number of articles listed on this page
       count = 0
       # the page number
       pageNo = 1
       # the text for this subpage (if no subpages, will just be on the main
       # page)
       mainText += "===Articles (count: " + str(len(self.articles)) + ")===\n"
       # None can never equal a first letter, so the first article always
       # gets its letter heading (the old "Z" start value hid the heading
       # when the first title began with Z)
       prevChar = None
       firstChar = ""
       subText = ""
       # make sure the first batch of articles goes to the main page
       firstBatch = True
       # NOTE(review): the headings written to mainText for overflow batches
       # below were probably links to the sub-pages before the markup was
       # lost; they are left as plain headings.
       for s in self.articles:
           if s[0] != prevChar:
               subText += "\n\n"
               subText += "====" + s[0] + "====\n"
               prevChar = s[0]
               if count == 0:
                   firstChar = prevChar
           subText += "[[" + s + "]] - "
           count += 1
           if count > MAX:
               count = 0
               if firstBatch:
                   firstBatch = False
                   mainText += subText
               else:
                   mainText += "\n"
                   mainText += "====" + \
                                firstChar + " through " + prevChar + "====\n"
                   subText = subText.replace("<range>", firstChar + " through " + \
                                             prevChar)
                   self.writeProjPage(self.articleOut + "/Page" + str(pageNo),
                                      subText)
                   pageNo += 1
               firstChar = prevChar
               subText = "===Articles <range>===\n" + \
                         "====" + prevChar + "====\n"
       if splitting and not firstBatch:
           mainText += "====" + \
                        firstChar + " through " + prevChar + "====\n"
           subText = subText.replace("<range>", firstChar + " through " + prevChar)
           self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
           pageNo += 1
       else:
           mainText += subText

       mainText += "\n\n"

       mainText += "==Talk pages==\n"
       firstChar = "Z" # replaced by the first talk page's letter below
       if splitting:
           subText = "This article contains links to some talk pages " + tagText + \
                     "with {{" + self.template + "}} " + \
                     "on their talk page. It was generated by " + \
                      "Ganeshbot. Its purpose is to be able to track " + \
                     "the project history using [[Special:Recentchangeslinked/" + \
                     "Wikipedia:WikiProject " + output + \
                     "/Page" + str(pageNo) + "|related changes]] or [http://tools.wikimedia.de/~interiot/" + \
                     "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" + \
                     "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" + \
                     "%3DWikipedia:WikiProject_" + output + \
                     "/Page" + str(pageNo) + "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist] which " + \
                     "only shows the last change for each article.\n" + \
                     "\n" + \
                     "===Articles <range>===\n"
       else:
           subText = ""

       count = 0
       for s in self.articlesTalk:
           if count == 0:
               firstChar = s.split(":")[1][0]
           subText += "[[" + s + "]] - "
           count += 1
           if count > MAX:
               count = 0
               endChar = s.split(":")[1][0]
               mainText += "*" + \
                            firstChar + "-" + endChar + "\n"
               subText = subText.replace("<range>", firstChar + " through " + \
                                         endChar)
               self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
               pageNo = pageNo+1
               firstChar = endChar
               subText = "===Articles <range>===\n"
       if splitting:
           # with no article talk pages there is nothing to put on a
           # sub-page (and no last title to take the end letter from)
           if self.articlesTalk:
               endChar = self.articlesTalk[-1].split(":")[1][0]
               mainText += "*" + \
                            firstChar + " through " + endChar + "\n"
               subText = subText.replace("<range>", firstChar + " through " + endChar)
               self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
               pageNo += 1
       else:
           mainText += subText

       mainText += "\n\n"

       # From here on, when splitting, all remaining namespaces accumulate in
       # one subText that is written to a single sub-page at the very end;
       # otherwise each list is appended to mainText and subText is reset.
       mainText += "===Wikipedia (count: " + str(len(self.wikis)) + ")===\n"
       if splitting:
           subText = "This article contains links to templates, categories, portals, " + \
                     "and images " + tagText + "with {{" + self.template + "}} " + \
                     "on their talk page. It was generated by " + \
                      "WatchlistBot. Its purpose is to be able to track " + \
                     "the project history using [[Special:Recentchangeslinked/" + \
                     "Wikipedia:WikiProject " + output + \
                     "/Page" + str(pageNo) + "|related changes]] or [http://tools.wikimedia.de/~interiot/" + \
                     "cgi-bin/offtoolserver/RC_firstonly?url=http%3A%2F%2Fen.wikipedia.org" + \
                     "%2Fw%2Findex.php%3Ftitle%3DSpecial%3ARecentchangeslinked%26target" + \
                     "%3DWikipedia:WikiProject_" + output + \
                     "/Page" + str(pageNo) + "%26hideminor%3D0%26days%3D7%26limit%3D500 related watchlist] which " + \
                     "only shows the last change for each article.\n" + \
                     "\n" + \
                     "===Wikipedia===\n"
           mainText += "*Wikipedia\n"
       else:
           subText = ""

       for s in self.wikis:
           subText += "[[" + s + "]] - "

       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       for s in self.wikisTalk:
           subText += "[[" + s + "]] - "

       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       mainText += "\n\n"
       mainText += "===Templates (count: " + str(len(self.templates)) + ")===\n"
       if splitting:
           subText += "\n\n===Templates===\n"
           mainText += "*Templates\n"
       for s in self.templates:
           subText += "[[" + s + "]] - "
       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       for s in self.templatesTalk:
           subText += "[[" + s + "]] - "
       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       mainText += "\n\n"
       mainText += "===Categories (count: " + str(len(self.categories)) + ")===\n"
       if splitting:
           subText += "\n\n===Categories===\n"
           mainText += "*Categories\n"
       # category titles carry a leading ":" so these are links to the
       # category pages
       for s in self.categories:
           subText += "[[" + s + "]] - "
       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       for s in self.categoriesTalk:
           subText += "[[" + s + "]] - "
       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       mainText += "\n\n"
       mainText += "===Portals (count: " + str(len(self.portals)) + ")===\n"
       if splitting:
           subText += "\n\n===Portals===\n"
           mainText += "*Portals\n"
       for s in self.portals:
           subText += "[[" + s + "]] - "
       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       for s in self.portalsTalk:
           subText += "[[" + s + "]] - "
       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       mainText += "\n\n"
       mainText += "===Images (count: " + str(len(self.images)) + ")===\n"
       if splitting:
           subText += "\n\n===Images===\n"
           mainText += "*Images\n"
       for s in self.images:
           subText += "[[" + s + "]] - "
       # NOTE(review): unlike every other namespace, the image talk pages
       # (self.imagesTalk) are never listed -- confirm whether that is
       # intended.

       if not splitting:
           mainText += "\n"
           mainText += subText
           subText = ""

       if splitting:
           self.writeProjPage(self.articleOut + "/Page" + str(pageNo), subText)
           pageNo = pageNo+1

       self.writeProjPage(self.articleOut, mainText)

   def writeProjPage (self, pageName, text):
       """
       Write text to the sub-page pageName of this project, that is, to
       "Wikipedia:WikiProject <project>/<pageName>".
       """
       fullTitle = "Wikipedia:WikiProject " + self.project + "/" + pageName
       target = wikipedia.Page(wikipedia.getSite(), fullTitle)
       writePage(target, text, "full update by Ganeshbot")

def organizeCategories (tag = "Numismaticnotice", topLevelCat = "Numismatics",
                        project = "Numismatics", pageName="Categories"):
   """
   organize the categories hierarchically
   write the results to "Wikipedia:WikiProject <project>/<page>"

   tag is the template whose tagged categories are listed
   topLevelCat is the category (without "Category:") the tree starts from
   """
   # get the list of tagged categories
   dummy = []
   taggedCatList = []
   getTagged(tag, taggedCatList, dummy)
   print(len(taggedCatList))

   # NOTE(review): the line-break markup and link brackets in the literals
   # below were lost in transit and have been rebuilt as "<br />" and
   # "[[:...]]"; confirm against the page this bot used to produce.
   text = "This is the category structure for [[Wikipedia:WikiProject " + \
          project + "|" + project + "]]<br />\n"

   cat = catlib.Category(wikipedia.getSite(), "Category:" + topLevelCat)
   # the leading colon makes this a link to the category instead of putting
   # the output page into it
   text += "[[:Category:" + topLevelCat + "]]<br />\n"
   # the dash is written as an HTML entity so the source stays plain ASCII
   text = organizeCatsNextLevel(text, cat, "|&mdash;", taggedCatList)

   page = wikipedia.Page(wikipedia.getSite(),
                         "Wikipedia:WikiProject " + project + "/" + pageName)
   writePage(page, text, "full update by Ganeshbot")

def organizeCatsNextLevel (text, cat, substring, taggedCatList, seen = None):
   """
   recursively organize the category text
   text is the text so far, add to that
   cat is the catlib.Category of the previous level
   substring is the text to put before each category
   taggedCatList is the list of tagged categories
   seen is the set of category titles already listed; callers normally
   leave it out, it is created on the first call and shared by the recursion
   returns the text so far
   """
   # Track listed titles exactly.  Searching the output text instead would
   # treat "Category:Coins" as listed as soon as "Category:Coins of Rome"
   # appears in it.
   if seen is None:
       seen = set([cat.title()])

   for subcat in cat.subcategories():
       title = subcat.title()
       # only categories that are included in our project are shown
       if title not in taggedCatList:
           continue
       if title in seen:
           # already listed: mention it, but don't recurse, to prevent
           # duplication and, more importantly, infinite loops
           text += substring + "[[:" + title + \
                   "]] (already included, see above)<br />\n"
           continue
       seen.add(title)
       text += substring + "[[:" + title + "]]<br />\n"
       # NOTE(review): the indent is written with non-breaking spaces
       # because runs of plain spaces collapse when the page is rendered;
       # the original literal could not be recovered exactly.
       text = organizeCatsNextLevel(text, subcat,
                                    "|&nbsp;&nbsp;&nbsp;"+substring,
                                    taggedCatList, seen)
   return text

def getExcluded (project):
   """
   get the list of pages which should not be tagged even though they're in
   tagged categories
   this can also be used to get excluded categories, if they're listed on
   the project exclusion page

   The page read is "User:WatchlistBot/<project>".  Every wiki link below
   its "----" line is returned as the title of the matching talk page; an
   empty list is returned when the page does not exist.
   """
   page = wikipedia.Page(wikipedia.getSite(), "User:WatchlistBot/" + project)
   if not page.exists():
       return []

   text = page.get()
   # the list of articles is below the "----" line; without such a line the
   # whole page is scanned
   start = text.find("----\n")
   if start != -1:
       text = text[start+4:]

   pages = []
   # the piece before the first "[[" is not a link, so it is skipped
   for piece in text.split("[[")[1:]:
       end = piece.find("]]")
       if end == -1:
           continue
       # drop the "|label" part of a piped link and any surrounding blanks
       target = piece[:end].split("|", 1)[0].strip()
       if target:
           pages.append(getTalkVersion(target))
   return pages

def getTalkVersion (name):
   """
   given a page name, convert it to the associated talk page

   A leading ":" (as in the link form ":Category:Foo") is ignored.  A name
   that already is a talk page is returned as such.  Text before a colon
   that is not a namespace name is treated as part of an article title, so
   "Star Trek: Voyager" becomes "Talk:Star Trek: Voyager".
   """
   subjectNamespaces = ("User", "Wikipedia", "Image", "File", "MediaWiki",
                        "Template", "Help", "Category", "Portal")

   title = name
   if title.startswith(":"):
       title = title[1:]

   # split on the first colon only, so colons inside the title survive
   parts = title.split(":", 1)
   if len(parts) == 2:
       prefix = parts[0]
       if prefix == "Talk":
           return title
       if prefix.endswith(" talk") and \
          prefix[:-len(" talk")] in subjectNamespaces:
           return title
       if prefix in subjectNamespaces:
           return prefix + " talk:" + parts[1]
   # no namespace prefix at all: an ordinary article
   return "Talk:" + title

def writePage (page, text, comment):
   """
   Save text to page with the edit summary comment.  When the module flag
   DBG is set, write the text to a local "<name>.txt" file instead.

   page is a wikipedia.Page; returns True.
   """
   if not DBG:
       # NOTE(review): the three lines below reached us without indentation
       # or comment marker and so could not have run as code; they are kept
       # as the disabled check they appear to have been.  Re-enable if the
       # bot should stop when it has new talk-page messages.
       #if wikipedia.getSite().messages:
       #    wikipedia.output(u"Exiting -- you have message")
       #    return False
       page.put(text, comment, minorEdit=False)
       return True

   # derive the file name: drop everything up to the first "/" twice, then
   # everything up to the first ":" (each step only if that separator is
   # present)
   pageName = page.title()
   for separator in ("/", "/", ":"):
       start = pageName.find(separator)
       if start != -1:
           pageName = pageName[start+1:]

   # disabled alternative: write to a user sub-page instead of a file
   #page = wikipedia.Page(wikipedia.getSite(),
   #                      "User:mom2jandk/" + pageName)
   #page.put(text, comment, minorEdit=False)

   wikipedia.output(u"Writing file " + pageName + u".txt")
   f = codecs.open(pageName + ".txt", mode="w", encoding="utf8")
   try:
       f.write(text)
   finally:
       # close the file even when the write fails
       f.close()
   return True

def untagPage (pageName, tag):
   """
   remove the tag from the given talk page, if it is there

   Returns the result of writePage when the page was changed, and True when
   there was nothing to do (missing page, redirect, tag absent or never
   closed).
   """
   page = wikipedia.Page(wikipedia.getSite(), pageName)
   if not page.exists() or page.isRedirectPage():
       return True

   text = page.get()
   # NOTE(review): this also matches longer template names that start with
   # tag (tag "Electron" matches "{{Electronics"); confirm that is acceptable.
   tagStart = text.find("{{"+tag)
   if tagStart == -1:
       wikipedia.output("Page " + page.title() + " not tagged")
       return True

   closing = text.find("}}", tagStart)
   if closing == -1:
       # never guess where an unterminated tag ends
       wikipedia.output("Page " + page.title() + " has an unclosed " + tag +
                        " tag, not changed")
       return True

   # remove the tag including its "}}", and the newline after it if any
   tagEnd = closing + 2
   if text[tagEnd:tagEnd+1] == "\n":
       tagEnd += 1
   text = text[:tagStart] + text[tagEnd:]
   return writePage(page, text, "Removing " + tag)

def tagPage (pageName, tag, params = ""):
   """
   Put the tag on the talk page pageName.

   params is optional parameter text appended to the tag (like "|class=Stub").
   A page that does not exist yet is created; a redirect is reported and left
   alone.  Returns what tagIt returns, or True for a redirect.
   """
   talkPage = wikipedia.Page(wikipedia.getSite(), pageName)

   if not talkPage.exists():
       # we don't mind if the page doesn't exist yet, just create it
       return tagIt(talkPage, "", tag+params)

   if talkPage.isRedirectPage():
       wikipedia.output("Page " + talkPage.title() + " is a redirect")
       return True

   return tagIt(talkPage, talkPage.get(), tag+params)

def tagIt (page, text, tag):
   """
   Put the template call for tag at the top of text and save it to page.

   tag is the template name, optionally followed by parameters such as
   "|class=cat".  Returns the result of writePage.
   """
   # The braces make this a template call -- the same "{{<tag>" form that
   # untagPage looks for.  They had been lost, leaving "Template:<tag>" as
   # plain text on the page.
   text = "{{" + tag + "}}\n\n" + text
   return writePage(page, text, "Adding " + tag)

def findArticlesInCategory (catName, articles, confirm = False,
                            includeCats = False):
   """
   find all the articles in the given category and add the titles of their
   talk pages to articles
   If confirm is true, check each article with the user
   articles is the list so far; it is extended in place, duplicates are
   removed from it, and it is also returned
   includeCats indicates whether category talk pages should be included
   """
   # subject namespace number -> prefix of the matching talk page
   talkPrefixes = {
       10: "Template talk:",
       6: "Image talk:",
       100: "Portal talk:",
       4: "Wikipedia talk:",
   }
   # article, wikipedia, image, template and portal talk
   talkNamespaces = (1, 5, 7, 11, 101)

   # get the category (don't include it, since tagging articles and categories
   # is handled separately)
   cat = catlib.Category(wikipedia.getSite(), catName)

   # get all pages in this category
   for page in cat.articles():
       # if confirming, check
       if confirm:
           response = wikipedia.input(u"Do you want to tag " + page.title() + u"? (y for yes)")
           if response != "y":
               continue

       namespace = page.namespace()
       if namespace == 0: # article
           articles.append("Talk:" + page.title())
       elif namespace in talkPrefixes:
           articles.append(talkPrefixes[namespace] + page.titleWithoutNamespace())
       elif namespace in talkNamespaces:
           articles.append(page.title())
       elif namespace == 15: # category talk
           if includeCats:
               articles.append(page.title())
       elif namespace in (2, 3): # user, user talk
           # ignore these
           pass
       else:
           print("Unexpected namespace on " + page.title() + ": " + str(namespace))

   # remove duplicates from the caller's list itself, keeping first-seen
   # order (rebinding the local name would leave the caller's list untouched)
   seen = set()
   unique = []
   for title in articles:
       if title not in seen:
           seen.add(title)
           unique.append(title)
   articles[:] = unique
   return articles

def updateCategoryList (catList, catName, taggedCats, otherTaggedCats,
                        keywords, excluded = [],
                        questionText = u"Do you want to tag ", confirm = True):
   """
   Walk the category tree from catName and collect into catList the names of
   the categories to tag; returns catList.

   A "" as first element of catList is the "user asked to stop" marker: once
   it is there, calls return immediately.
   taggedCats: categories already tagged; treated as a "y" answer.
   otherTaggedCats: categories carrying a related tag; treated as "n".
   keywords: words that, when found in the category name, mean "y".
   excluded: category talk page titles to treat as "n".
   questionText: start of the question put to the user.
   confirm: if False nobody is asked; everything not refused above is "y".

   Answers: "y" adds the category and descends into its subcategories, "yn"
   adds it without descending, "s" stops the whole walk, anything else skips
   the category.
   """
   # check if we're quitting
   if len(catList) > 1 and catList[0] == "":
       return catList

   cat = catlib.Category(wikipedia.getSite(), "Category:" + catName)

   # a category that was already chosen is not looked at again
   if catName in catList:
       return catList

   fullName = "Category:" + catName
   alreadyTagged = fullName in taggedCats

   if fullName in otherTaggedCats or "Category talk:" + catName in excluded:
       # a refusal wins over everything else
       answer = "n"
   else:
       keywordHit = False
       for keyword in keywords:
           if keyword in catName:
               keywordHit = True
       if alreadyTagged or keywordHit or confirm == False:
           answer = "y"
       else:
           # nothing decided it for us, so ask the user
           answer = wikipedia.input(questionText + cat.title() + u"? (y for yes, yn for yes but no recursion, s for stop recursion)")

   if answer == "s":
       # put "" into the catlist at the beginning as a marker
       catList.insert(0, "")
       return catList

   if answer in ("y", "yn"):
       catList.append(cat.titleWithoutNamespace())

   if answer == "y":
       for child in cat.subcategories():
           updateCategoryList(catList, child.titleWithoutNamespace(),
                              taggedCats, otherTaggedCats, keywords,
                              excluded, questionText, confirm)
   return catList

def tagCategories (catName = "Electronics", tag = "Electron",
                   otherTag = "", project = "Electronics",
                   params = "|class=cat", keywords = []):
   """
   tag all categories in the specified category and subcategories with the
   specified tag (at the top of the page)
   if otherTag is not "", skip categories which are tagged with othertag
   check with the user for each category
   keywords are words that if they're in the category, it will be tagged
   without confirmation
   project selects the exclusion page read by getExcluded
   params is appended to the tag when tagging
   """
   wikipedia.put_throttle.setDelay(10, absolute = True)

   # get the list of categories which are already tagged
   taggedCatList = []
   taggedArticleList = []
   getTagged(tag, taggedCatList, taggedArticleList)

   otherTaggedCatList = []
   if otherTag != "":
       getTagged(otherTag, otherTaggedCatList, taggedArticleList)

   # get the list of categories and articles that are to be excluded (articles
   # will be ignored)
   excluded = getExcluded(project)

   # get the category list
   catList = []
   catList = updateCategoryList(catList, catName, taggedCatList, otherTaggedCatList,
                                keywords, excluded)

   # if the first element of catList is "", remove it, it was just a marker
   # (the list is empty when the user declined the top category)
   if catList and catList[0] == "":
       catList.remove("")

   # remove duplicates and sort
   catList = sorted(set(catList))

   # tag only the categories which are not already tagged
   alreadyTagged = set(taggedCatList)
   for cat in catList:
       if "Category:"+cat not in alreadyTagged:
           tagPage("Category talk:" + cat, tag, params)

def untagCategories (catList = [],
                     tag = "Electron", project = "Electronics"):
   """
   Remove tag from the talk page of every category named in catList (names
   without the "Category:" prefix).  project is not used.
   """
   wikipedia.put_throttle.setDelay(10, absolute = True)

   for name in catList:
       talkTitle = "Category talk:" + name
       untagPage(talkTitle, tag)

def getTagged (tag, catList, articles):
   """
   get a list of categories and articles which contain the specified tag

   Both lists are extended in place: catList receives "Category:<name>" for
   every category talk page that transcludes Template:<tag>, articles
   receives the title of every other transcluding page.
   """
   page = wikipedia.Page(wikipedia.getSite(), "Template:" + tag)
   refs = page.getReferences(onlyTemplateInclusion=True)

   for ref in refs:
       name = ref.title()
       # split on the first colon only: a category name may itself contain
       # colons and must not be cut short
       result = name.split(":", 1)
       if result[0] == "Category talk":
           catList.append("Category:"+result[1])
       else:
           articles.append(name)

def untag (catList = [],
           tag = "Numismaticnotice",
           returnList = False):
   """
   Remove tag from the talk pages of all articles in the categories named in
   catList -- useful when the bot has made a mistake.

   With returnList true nothing is changed; the sorted, duplicate-free list
   of talk pages is returned instead.
   """
   collected = []
   for name in catList:
       findArticlesInCategory("Category:"+name, collected, False)
   talkPages = sorted(set(collected))

   if returnList:
       return talkPages

   for talkPage in talkPages:
       untagPage(talkPage, tag)
   wikipedia.stopme()

def classify (catName="Unassessed numismatic articles", tag="Numismaticnotice",
              comment="Numismatics assessment, class="):
   """
   go through all articles in the specified category and classify them as
   image, template, category, portal, or NA. Articles are left as is (as are
   lists and disambig pages)

   tag is the template whose class parameter is set; comment is the start of
   the edit summary, to which the class name is appended.
   """
   articles = []
   findArticlesInCategory("Category:"+catName, articles, False, True)

   templatesToTag = []
   categoriesToTag = []
   imagesToTag = []
   portalsToTag = []
   # NOTE(review): the disambiguation handling (dabsToTag) reached us as
   # unindented lines that broke the function; it is kept disabled, which
   # matches the docstring ("Articles are left as is").
   #dabsToTag = []

   # the titles are talk-page titles, so the namespace is the prefix
   for article in articles:
       if article.startswith("Template talk:"):
           templatesToTag.append(article)
       elif article.startswith("Category talk:"):
           categoriesToTag.append(article)
       elif article.startswith("Image talk:"):
           imagesToTag.append(article)
       elif article.startswith("Portal talk:"):
           portalsToTag.append(article)
       # if this is a regular talk page, assume it's disambig
       #elif article.startswith("Talk:"):
       #    dabsToTag.append(article)

   addParams(templatesToTag, "class", "template", tag, comment + "template")
   addParams(categoriesToTag, "class", "category", tag, comment + "category")
   addParams(imagesToTag, "class", "image", tag, comment + "image")
   addParams(portalsToTag, "class", "portal", tag, comment + "portal")
   #addParams(dabsToTag, "class", "dab", tag, comment + "dab")

def addParams (firstCat = "Unassessed Louisville articles",
               secondCat = "Louisville stubs",
               recurse = True,
               paramName = "class",
               paramValue = "Stub",
               tag = "WikiProject Louisville",
               comment = "Louisville assessment, adding class=Stub"):
    """
    set a tag parameter on every article that is in both firstCat and the
    category list built from secondCat

    firstCat and secondCat are category names without the "Category:" prefix
    recurse is accepted but is not read anywhere in this function body
    paramName is the parameter to add (e.g., "class")
    paramValue is the value to assign (e.g., "NA")
    tag is the name of the template tag
    comment is the text to use for the comment when saving

    NOTE(review): a later definition in this module,
    addParams(articles, paramName, paramValue, tag, comment), rebinds the
    name addParams. Once the module is loaded this category-based variant is
    therefore no longer reachable under that name, and the call on the last
    line resolves to the later definition.
    """
    # everything in the first category
    inFirst = []
    findArticlesInCategory("Category:"+firstCat, inFirst, False)

    # everything in the categories derived from the second category
    wantedCats = updateCategoryList([], secondCat, [], [],
                                    "Do you want to include ", False)
    inSecond = []
    for wantedCat in wantedCats:
        findArticlesInCategory("Category:"+wantedCat, inSecond, False)

    # keep the articles that show up in both, in first-category order
    inBoth = [title for title in inFirst if title in inSecond]

    addParams(inBoth, paramName, paramValue, tag, comment)

def _setTemplateParam (text, tag, paramName, paramValue):
    """
    return text with paramName=paramValue set inside the first use of tag,
    or None if the tag (or its closing "}}") cannot be found

    an existing "|paramName=..." entry is replaced up to the next "|" or the
    closing braces; otherwise the parameter is inserted just before "}}"
    """
    import re

    # skip the first character so we don't have to worry about upper/lower
    tagStart = text.find(tag[1:])
    if tagStart == -1:
        return None
    # the tag is taken to end at the first "}}" after its name
    tagEnd = text.find("}}", tagStart)
    if tagEnd == -1:
        return None

    newParam = "|" + paramName + "=" + paramValue

    # only accept the parameter name when it directly follows a "|" (with
    # optional whitespace) and is followed by "=", so that a value or another
    # parameter name that merely contains paramName is not overwritten
    existing = re.compile(r"\|\s*" + re.escape(paramName) + r"\s*=[^|]*")
    found = existing.search(text, tagStart, tagEnd)
    if found:
        return text[:found.start()] + newParam + text[found.end():]
    return text[:tagEnd] + newParam + text[tagEnd:]


def addParams (articles, paramName, paramValue, tag, comment):
    """
    set paramName=paramValue in the tag on every page in articles

    articles is the list of articles to change
    paramName is the parameter to add (e.g., "class")
    paramValue is the value to assign (e.g., "NA")
    tag is the name of the template tag
    comment is the text to use for the comment when saving

    pages on which the tag cannot be located are skipped; processing stops
    at the first page for which writePage returns a false value
    """
    for article in articles:
        page = wikipedia.Page(wikipedia.getSite(), article)
        text = _setTemplateParam(page.get(), tag, paramName, paramValue)
        if text is None:
            # no tag on this page: there is nothing that can safely be edited
            continue

        if not writePage(page, text, comment):
            break

def replaceTag (oldTag="LouisvilleWikiProject", newTag="WikiProject Louisville"):
    """
    replace oldTag with newTag on every page that getTagged collects for
    oldTag (newTag may also be a tag plus parameters)

    the text is rewritten with
    wikipedia.replaceExceptMathNowikiAndComments; processing stops at the
    first page for which writePage returns a false value
    """
    taggedPages = []
    getTagged(oldTag, [], taggedPages)

    # the same edit comment is used for every page
    summary = "replacing " + oldTag + " with " + newTag

    for title in taggedPages:
        page = wikipedia.Page(wikipedia.getSite(), title)
        newText = wikipedia.replaceExceptMathNowikiAndComments(
            page.get(), oldTag, newTag)
        if writePage(page, newText, summary):
            continue
        break

def tag (tag = "Numismaticnotice", params = "", otherTag = "Exonumianotice",
         project = "Numismatics", confirm=False, catList = [],
         returnList = False, assessmentTag = "numismatic articles"):
    """
    tag articles in tagged categories
    if a page is already tagged with otherTag, skip it (use otherTag = "" for none)
    catList is a list of categories to check in. If empty, use tagged categories
    if params is given, include it after the tag, when tagging an article
    if returnList is true, don't actually tag anything, just return the
      sorted, duplicate-free list of articles found in the categories
      in this case, also don't skip a page just because it's already tagged
    assessmentTag is a text string contained in the assessment categories, use
      "" to ignore (no category is filtered out then)
    project is passed to getExcluded to find pages that must never be tagged
    confirm is passed through to findArticlesInCategory
    """

    # work on a private copy: the list is handed to getTagged to be filled
    # in, and the default argument object is shared between calls
    catList = list(catList)

    # get the list of all tagged articles in taggedArticles
    # if catList was given, leave it as is. Otherwise, populate catList with
    #   all tagged categories
    taggedArticles = []
    if not catList:
        getTagged(tag, catList, taggedArticles)
        # skip the assessment categories (otherwise, we won't skip articles
        # which are currently tagged but shouldn't be); with an empty
        # assessmentTag every category is kept
        if assessmentTag != "":
            catList = [cat for cat in catList if assessmentTag not in cat]
    else:
        getTagged(tag, [], taggedArticles)
        # put "Category:" in front of the category names
        catList = ["Category:"+cat for cat in catList]

    # add the articles tagged with otherTag to the list of taggedArticles
    if otherTag != "":
        getTagged(otherTag, [], taggedArticles)

    # get the list of untagged articles in the categories in catList (which
    # was either supplied as a parameter, or was populated with tagged categories)
    untaggedArticles = []
    for cat in catList:
        findArticlesInCategory(cat, untaggedArticles, confirm)

    # remove duplicates and sort
    untaggedArticles = sorted(set(untaggedArticles))

    # if we're returning a list, stop here
    if returnList:
        return untaggedArticles

    # drop the articles that are already tagged and the excluded ones
    skip = set(taggedArticles)
    skip.update(getExcluded(project))
    untaggedArticles = [article for article in untaggedArticles
                        if article not in skip]

    if not untaggedArticles:
        wikipedia.output(u"No untagged articles")

    print("Tagging " + str(len(untaggedArticles)) + " articles")
    # tag the articles
    for article in untaggedArticles:
        tagPage(article, tag, params)

    wikipedia.stopme()

def fixWrongTags (catList = ["Coin games", "Electronic currencies",
                             "Digital currency exchangers",
                             "Digital gold currencies",
                             "Money", "Money stubs",
                             "Foreign exchange market", "Ancient mints",
                             "Challenge coin"]):
    """
    remove the Numismaticnotice tag from the articles in the given
    categories, except for articles that some other category still
    requires to be tagged
    """
    # articles that should carry the tag (tag() only returns the list here)
    keepTagged = set(tag("Numismaticnotice", "", "Exonumianotice",
                         "Numismatics", False, [], returnList=True))

    # every article in the given categories, whether it is tagged or not
    # (untag() only returns the list here)
    candidates = untag(catList, "Numismaticnotice", returnList=True)

    # untag the candidates that nothing else requires to be tagged
    for candidate in candidates:
        if candidate in keepTagged:
            continue
        untagPage(candidate, "Numismaticnotice")

def findDoubleTags (catList = []):
    """
    find articles that are in numismatics as well as exonumia categories
    and print them, one "*title" line per article, sorted and without
    duplicates

    catList is accepted but is not used
    """

    # find articles that think they should be tagged Exonumia and Numismaticnotice
    numArticles = tag("Numismaticnotice", "", "", "Numismatics", False, [], True)
    getTagged("Numismaticnotice", [], numArticles)
    exoArticles = tag("Exonumianotice", "", "", "Numismatics", False, [], True)
    getTagged("Exonumianotice", [], exoArticles)

    # getTagged appends to lists that may already hold the same titles, so
    # intersect as sets to report every article only once
    bothArticles = sorted(set(numArticles) & set(exoArticles))

    # NOTE(review): the line terminator literal was split over two lines in
    # the source (possibly a stripped "<br />"); a plain newline is used
    text = "".join(["*" + article + "\n" for article in bothArticles])
    print(text)
    wikipedia.stopme()

def listProjects ():
    """
    print one "number: name" line for every entry of the module-level
    projects list; the number is the entry's position, starting at 0
    """
    for number, name in enumerate(projects):
        print(str(number) + ": " + name)

# the projects handled by main(); kept at module level because
# listProjects() reads this name as a global
projects = ["Yorkshire"]

def main():
    """
    update the project watchlists

    the lists below are index-aligned with projects: entry i of each list
    holds the setting for project number i (numbering starts at 0). A
    project is only processed when its runProjects entry is true.
    """
    # NOTE(review): these settings appeared twice in the source. The first
    # copy was mis-indented and used taggedPagesFlags = [False] and
    # inCategoryFlags = [True]. The values below come from the later copy,
    # which takes effect when both copies run in order -- confirm that they
    # are the intended ones.
    templates = ["WikiProject Yorkshire"]
    articleOuts = ["Articles"]
    # pages to include explicitly, one (here empty) list per project
    includePagesLists = [[] for _ in projects]
    taggedPagesFlags = [True]
    taggedCategoriesFlags = [False]
    inCategoryFlags = [False]
    runProjects = [True]

    for i, project in enumerate(projects):
        # indexing (rather than zip) keeps a too-short settings list an error
        template = templates[i]
        articleOut = articleOuts[i]
        includePagesList = includePagesLists[i]
        taggedPagesFlag = taggedPagesFlags[i]
        taggedCategoriesFlag = taggedCategoriesFlags[i]
        inCategoryFlag = inCategoryFlags[i]

        if not runProjects[i]:
            continue

        print("Updating watchlist for: %s using template: %s. Saving to: %s"
              % (project, template, articleOut))
        wl = Watchlist(project, template, articleOut, includePagesList)

        # collect the pages from each of the enabled sources
        if taggedPagesFlag:
            wl.getTaggedPages()

        if taggedCategoriesFlag:
            wl.getPagesFromTaggedCategories()

        if inCategoryFlag:
            wl.getPagesFromCategory()

        wl.writeList(taggedPagesFlag)

    wikipedia.stopme()

if __name__ == "__main__":
    # wikipedia.stopme() is called exactly once on the way out, whether
    # main() returns normally or raises; an exception still propagates
    try:
        main()
    finally:
        wikipedia.stopme()

</syntaxhighlight>