User:PMBot/Code
Following is the preliminary source code. It reads topic page names from Proteopedia:Topic Pages and outputs a list of what would be written during the first pass. Only topic pages are read; no structures are changed.
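For illustration, this first pass only builds and prints an in-memory mapping from lowercased PDB codes to the sets of topic pages that reference them, along these lines (the codes and page names here are invented):

dic = {'1abc': set(['Some Topic']), '2xyz': set(['Some Topic', 'Another Topic'])}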
# -*- coding: utf-8 -*-
"""
pmbot [OPTIONS]

Goes through all topic pages looking for the usage of non-uploaded
structure pages (official PDBs) in scenes. Each of these PDB pages
is edited such that its section named "About this Structure"
contains the string

<!-- PMBot Start -->
"The page TOPICPAGE refers to 1ABC."
or "The pages TOPICPAGES refer to 1ABC."
<!-- PMBot End -->

where TOPICPAGE is a topic page link, TOPICPAGES is a comma-separated
list of topic page links, and 1ABC is the name of the respective structure
page. If such a string already exists, it is updated.

Options:
At the moment, there are no options.
"""
from __future__ import with_statement # This isn't required in Python 2.6
#
# (C) R Stephan 2009
#
# Distributed under the terms of the GPL2.
#
__version__ = '0.10'
#
import wikipedia, re, sys, config
import catlib, traceback, itertools
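# Read throttle: wait 5 seconds between successive page retrievals.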
wikipedia.get_throttle.setDelay(5)
#wikipedia.put_throttle.setDelay(10)
msg = {
    'en': 'pmbot: maintenance of structure references',
}
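# First pass: for every PDB code found in a topic page, collect the
# set of topic pages that reference it. Nothing is written back yet.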
def main():
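    # Rco strips HTML comments; Rt1 extracts [[...]] link targets;
    # Rt2 drops the "|label" part of a piped link; Rst and Rap match
    # PDB codes in STRUCTURE_ templates and <applet load=...> tags;
    # Rta delimits the marked topic list on Proteopedia:Topic_Pages.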
    Rco = re.compile(u'<!--(?:.(?<!--))*-->')
    Rt1 = re.compile(u'(?<=\[\[)[^\]]+(?=\]\])')
    Rt2 = re.compile(u' *\|.*')
    Rst = re.compile(u'(?<=STRUCTURE_)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]')
    Rap = re.compile(u'(?<=<applet load=.)[1-9][0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z](?=[\'\"])')
    Rta = re.compile(u"Start of topic pages.*End of topic pages. Please DON'T REMOVE -->", re.DOTALL)
    site = wikipedia.getSite()
    dic = {}

    # Try to read the topic pages list.
    pagename = 'Proteopedia:Topic_Pages'
    alltopics = wikipedia.Page(site, pagename)
    try:
        temp_text = alltopics.get(False, True)
    except wikipedia.NoPage:
        print 'NoPage exception when trying to read topic page list'
        return
    # with codecs.open('Topic_Pages.txt', encoding='utf-8') as f: temp_text = f.read()

    m = Rta.search(temp_text)
    if m is None:
        print 'Topic list markers not found.'
        return
    alltopics_text = Rco.sub(u'', m.group(0))
    topicsIter = Rt1.finditer(alltopics_text)
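    # Visit every topic page found between the markers, following
    # redirects and stripping "#section" fragments until a page loads.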
    c = 0
    for topicmatch in topicsIter:
        # if c > 2: break
        c = c + 1
        t = topicmatch.group(0)
        topicname = Rt2.sub(u'', t)
        # TODO: check if already loaded before
        loaded = False
        while not loaded:
            sys.stdout.flush()
            print 'Retrieving ' + topicname.encode('ascii', 'xmlcharrefreplace')
            sys.stdout.flush()
            topic = wikipedia.Page(site, topicname)
            try:
                loaded = True
                topic_text = topic.get()
            except wikipedia.NoPage:
                print 'NoPage exception when trying to read ' + topicname.encode('ascii', 'xmlcharrefreplace')
                loaded = False
                break
            except wikipedia.SectionError:
                print 'Subject does not exist: ' + topicname.encode('ascii', 'xmlcharrefreplace')
                topicname = re.sub(ur"#.*", '', topicname)
                loaded = False
                continue
            except wikipedia.IsRedirectPage, inst:
                topicname = inst.args[0]
                print 'Redirected to ' + topicname.encode('ascii', 'xmlcharrefreplace')
                loaded = False
                continue
        # print topic_text.encode('utf-8')
        if not loaded:
            continue
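        # Collect candidate structure references from three sources:
        # plain [[...]] links, STRUCTURE_ template parameters, and
        # <applet load="..."> attributes.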
        links = itertools.chain(
            Rt1.finditer(topic_text),
            Rst.finditer(topic_text),
            Rap.finditer(topic_text))
        for linkmatch in links:
            l = linkmatch.group(0)
            linkname = Rt2.sub('', l)
            # PDB codes begin with a digit 1-9; ordinary page names do not.
            if linkname[0] > '0' and linkname[0] <= '9':
                # Use the lowercased PDB code consistently as the key.
                key = linkname.lower()
                if key in dic:
                    s = dic[key]
                else:
                    s = set()
                s.add(topicname.encode('ascii', 'xmlcharrefreplace'))
                dic[key] = s
    print 'Number of topics read: ', c
    print 'Number of structures to read/write: ', len(dic)
    sys.stdout.flush()
    print dic
if __name__ == '__main__':
    for arg in wikipedia.handleArgs():
        # - TODO: flag to switch from applet to scene backlinks to link backlinks
        # - TODO: add option to search scene files
        # - TODO: option to restrict number of topics read (c)
        # if arg.startswith("-p:"):
        #     if len(arg) == len("-p:"):
        #         pred = u"refers to"
        #     else:
        #         pred = arg[len("-p:"):]
        pass
    try:
        main()
    except:
        print 'Something went wrong.'
        traceback.print_exc()
    finally:
        print 'Stop.'
        wikipedia.stopme()
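A run prints progress messages, the two summary counts, and finally the collected mapping, roughly like this (all names and numbers are invented for illustration):

Retrieving Hemoglobin
...
Number of topics read:  42
Number of structures to read/write:  130
{'1abc': set(['Hemoglobin']), ...}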
