# -*- coding: utf-8 -*-
"""
Script to resolve double redirects, and to delete broken redirects.
Requires access to MediaWiki's maintenance pages or to a SQL dump file. Delete function requires
adminship.
Syntax:
python redirect.py action [argument]
where action can be one of these:
* double - fix redirects which point to other redirects
* broken - delete redirects whose targets don't exist. Requires adminship.
and argument can be:
* sql - retrieve information from a local dump (http://download.wikimedia.org).
if this argument isn't given, info will be loaded from the maintenance page of
the live wiki.
argument can also be given as "-sql:filename.sql".
NOTE: For resolving redirects, please use solve_disambiguation.py -redir.
"""
#
# (C) Daniel Herding, 2004
#
# Distributed under the terms of the PSF license.
#
__version__='$Id: redirect.py,v 1.12 2004/10/11 12:28:15 jeedo Exp $'
#
from __future__ import generators
import wikipedia, config
import re, sys
# Edit summary used when the bot fixes a double redirect, keyed by language code.
msg_double = {
    'en': u'Robot: Fixing double redirect',
    'de': u'Bot: Korrigiere doppelten Redirect',
    'is': u'Vélmenni: Lagfæri tvöfalda tilvísun',
    'ar': u'روبوت: اصلاح ازدوادية في اعادة التحويل',
}

# Deletion summary used when the bot removes a broken redirect, keyed by language code.
reason_broken = {
    'en': u'Robot: Redirect target doesn\'t exist',
    'de': u'Bot: Weiterleitungsziel existiert nicht',
}
def get_hamza_normalized_titles(sqlfilename):
    '''
    Loads a local SQL dump file and looks at every page whose title contains
    one of the hamza-carrying alif forms (أ, إ, آ), building a normalized
    title with each of them replaced by a bare alif (ا).

    Returns a dictionary where the normalized names are the keys and the
    original page titles (the intended redirect targets) are the values.

    NOTE: if the page isn't in the main namespace, the returned key will be
    prefixed by the default namespace identifiers. See full_title() in dump.py.
    '''
    # NOTE(review): `sqldump` is only imported at script level when the -sql
    # argument is given; this function assumes that import has happened.
    normalized_titles = {}
    # open sql dump and read page titles out of it
    dump = sqldump.SQLdump(sqlfilename, wikipedia.myencoding())
    for entry in dump.entries():
        full_name = entry.full_title()
        # Replace each hamza form of alif with the plain alif.
        normalized = full_name.replace(u'أ', u'ا')
        normalized = normalized.replace(u'إ', u'ا')
        normalized = normalized.replace(u'آ', u'ا')
        # Only titles that actually changed need a redirect.
        if normalized != full_name:
            normalized_titles[normalized] = full_name
    return normalized_titles
def get_harakat_normalized_titles(sqlfilename):
'''
Loads a local sql dump file, looks at all pages which have hamza in its title
and creates a new name for it.
Returns a dictionary where the normalized name is thr keys and the redirect
targets are the values.
NOTE: if the redirect isn't in the main namespace, the returned key will be
prefixed by the default namespace identifiers. See full_title() in dump.py.
'''
dict = {}
# open sql dump and read page titles out of it
dump = sqldump.SQLdump(sqlfilename, wikipedia.myencoding())
newArticleName=u""
for entry in dump.entries():
fullArticleName = entry.full_title()
newArticleName = re.sub(u'[ًَُّ~ِْ]','',fullArticleName )
#newArticleName = re.sub(u'(َ)','',fullArticleName )
#newArticleName = fullArticleName.replace(u'َ',u'')
#newArticleName = newArticleName.replace(u'ً',u'')
#newArticleName = newArticleName.replace(u'ّ',u'')
#newArticleName = newArticleName.replace(u'ُ',u'')
#newArticleName = newArticleName.replace(u'ٌ',u'')
#newArticleName = newArticleName.replace(u'ْ'u'')
#newArticleName = newArticleName.replace(u'ِ',u'')
#newArticleName = newArticleName.replace(u'ٍ',u'')
if newArticleName != fullArticleName:
print newArticleName +' '+fullArticleName
dict[newArticleName] = fullArticleName
return dict
def create_normalized_hamza_redirects(source):
for article_title , articlefullTitle in get_hamza_normalized_titles(source).iteritems():
target = wikipedia.PageLink('ar', article_title)
if not target.exists():
pl = wikipedia.PageLink('ar',article_title)
if article_title.replace(u':',u'X') == article_title:
pl.put('#تحويل [['+articlefullTitle+']]','#تحويل [['+articlefullTitle+']]')
print article_title +'--> ' + articlefullTitle
wikipedia.put_throttle()
#print get_hamza_normalized_titles(source)
def create_normalized_harakat_redirects(source):
for article_title , articlefullTitle in get_harakat_normalized_titles(source).iteritems():
pl = wikipedia.PageLink('ar',article_title)
#target = wikipedia.PageLink(wikipedia.mylang, article_title)
if article_title.replace(':','X') == article_title:
if not pl.exists():
pl.put('#تحويل [['+articlefullTitle+']]','#تحويل [['+articlefullTitle+']]')
print article_title +' --> ' + articlefullTitle
wikipedia.put_throttle()
else:
print article_title + 'already exists'
if pl.isRedirectPage():
if unicode(pl.getRedirectTo(),'utf-8') != articlefullTitle:
print 'hmm, youve got trouble in article ' + article_title
def fix_double_redirects(source):
for redir_name in retrieve_double_redirects(source):
print ''
redir = wikipedia.PageLink(wikipedia.mylang, redir_name)
try:
target = redir.getRedirectTo()
except wikipedia.IsNotRedirectPage:
wikipedia.output(u'%s is not a redirect.' % redir.linkname())
except wikipedia.NoPage:
wikipedia.output(u'%s doesn\'t exist.' % redir.linkname())
except wikipedia.LockedPage:
wikipedia.output(u'%s is locked, skipping.' % redir.linkname())
else:
try:
second_redir = wikipedia.PageLink(wikipedia.mylang, target)
second_target = second_redir.getRedirectTo(read_only = True)
except wikipedia.IsNotRedirectPage:
wikipedia.output(u'%s is not a redirect.' % second_redir.linkname())
except wikipedia.NoPage:
wikipedia.output(u'%s doesn\'t exist.' % second_redir.linkname())
else:
txt = "#تحويل [[%s]]" % second_target
redir.put(txt)
# read command line parameters
# what the bot should do (either resolve double redirs, or delete broken redirs)
action = None
# where the bot should get his infos from (either None to load the maintenance
# special page from the live wiki, the filename of a local sql dump file)
source = None
for arg in sys.argv[1:]:
    # argHandler consumes framework-wide options; presumably it returns the
    # argument unchanged when script-specific and a falsy value when consumed.
    arg = wikipedia.argHandler(arg)
    if arg:
        if arg == 'normalize':
            # NOTE(review): 'normalize' is accepted here but no dispatch
            # branch below handles it — confirm whether this is dead code.
            action = 'normalize'
        elif arg == 'normalize_hamza':
            action = 'normalize_hamza'
        elif arg == 'normalize_harakat':
            action = 'normalize_harakat'
        elif arg.startswith('-sql'):
            if len(arg) == 4:
                # bare "-sql": ask interactively for the dump filename
                sqlfilename = wikipedia.input(u'Please enter the SQL dump\'s filename: ')
            else:
                # "-sql:filename" form: take everything after the colon
                sqlfilename = arg[5:]
            # sqldump is only needed (and imported) when a local dump is used
            import sqldump
            source = sqlfilename
        else:
            print 'Unknown argument: %s' % arg
if action == 'normalize_hamza':
    # get summary text
    # NOTE(review): this reuses the double-redirect summary (msg_double) for
    # the hamza normalization run — confirm that is the intended edit summary.
    wikipedia.setAction(wikipedia.translate('ar', msg_double))
    create_normalized_hamza_redirects(source)
elif action == 'normalize_harakat':
    create_normalized_harakat_redirects(source)
else:
    # no recognized action given: print the usage text (module docstring)
    wikipedia.output(__doc__, 'utf-8')