mirror of
https://github.com/moparisthebest/rswiki
synced 2024-12-21 07:08:55 -05:00
Add hopefully final mw2gollum.py which was used to convert full history
This commit is contained in:
parent
d93c62b0d7
commit
98b33b232d
13
README.md
13
README.md
@ -2,9 +2,14 @@ This is the wiki hosted at rswiki.moparisthebest.com, in the wiki git repo.
|
||||
|
||||
In this repo you'll find the tools we used for converting from mediawiki to gollum:
|
||||
|
||||
* legit_pages.py was written by vortex, and used to scrape and generate legit_pages.txt, which was used to export RSWiki-20150610160818.xml
|
||||
* mw-to-gollum.rb was slightly modified from here: https://gist.github.com/MasterRoot24/ab85de0e7b82ba7f5974
|
||||
* mediawiki2gollum.sh uses mw-to-gollum.rb to convert the mediawiki xml, then does various things to clean up all the links and names so they will work
|
||||
* legit_pages.py was written by vortex, and used to scrape and generate legit_pages.txt, which was used to export RSWiki-*.xml
|
||||
* mw2gollum.py converts a mediawiki dump to gollum, preserving all contributor names, changes, and timestamps; each change is a separate commit
|
||||
* mwbashclean.sh is run before every commit mw2gollum.py makes
|
||||
* category.sh scrapes and generates category pages like mediawiki, needs to be run whenever pages are added to categories
|
||||
|
||||
todo: historical versions not converted/saved yet
|
||||
Deprecated because they do not preserve history:
|
||||
|
||||
* mw-to-gollum.rb was slightly modified from here: https://gist.github.com/MasterRoot24/ab85de0e7b82ba7f5974 DEPRECATED: use mw2gollum.py instead, which preserves history!
|
||||
* mediawiki2gollum.sh uses mw-to-gollum.rb to convert the mediawiki xml, then does various things to clean up all the links and names so they will work DEPRECATED: use mw2gollum.py instead, which preserves history!
|
||||
|
||||
todo: script to generate nginx redirects?
|
253462
RSWiki-20150610171636.xml
Normal file
253462
RSWiki-20150610171636.xml
Normal file
File diff suppressed because it is too large
Load Diff
69
mw2gollum.py
Normal file
69
mw2gollum.py
Normal file
@ -0,0 +1,69 @@
|
||||
#from time import strptime, strftime
|
||||
import os
|
||||
import codecs
|
||||
from datetime import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def _contributor_name(revision):
    """Return the name to credit for *revision*.

    The original code read ``revision.contributor.username`` directly, which
    raises AttributeError when the element is missing.  MediaWiki exports
    record anonymous edits with an ``<ip>`` element instead of ``<username>``,
    and suppressed contributors with neither, so fall back accordingly.
    """
    contributor = revision.contributor
    if contributor is not None:
        for tag_name in ('username', 'ip'):
            tag = contributor.find(tag_name)
            if tag is not None and tag.string:
                return tag.string
    return 'Anonymous'


def convert(dump, folder, clean_script, user_map):
    """Replay every revision in a MediaWiki XML dump as one git commit each.

    dump         -- path of the MediaWiki XML export to read.
    folder       -- directory to create; it becomes a new git repository and
                    the process's working directory (os.chdir is called and
                    not undone).
    clean_script -- command run with the page file name as its last argument
                    after each page write, or a false value to skip cleaning.
                    Relative paths are resolved from inside *folder*.
    user_map     -- dict mapping wiki usernames to git author strings
                    ('Name <email>'); unmapped users get a generated
                    '@rswiki.moparisthebest.com' address.

    Raises OSError if *folder* already exists (os.mkdir).  Failures of the
    git or clean commands are not treated as fatal, matching the original
    os.system calls; for example a revision that changes nothing makes
    'git commit' exit non-zero and is simply skipped.
    """
    # Imported locally so this function stays self-contained.
    import shlex
    import subprocess

    # Close the dump once parsed instead of leaking the handle.  No parser is
    # named, so the choice is left to BeautifulSoup exactly as before.
    with open(dump, 'r') as dump_file:
        soup = BeautifulSoup(dump_file)

    commits = []
    for page in soup.find_all('page'):
        # Page titles become file names: namespaces, underscores and spaces
        # all collapse to '-'.
        title = page.title.string.replace(':', ' ').replace('_', '-').replace(' ', '-')
        if title == 'Main-Page':
            title = 'Home'
        for revision in page.find_all('revision'):
            # timestamp like 2011-10-07T21:11:55Z
            timestamp_str = revision.timestamp.string
            commits.append({
                'title': title,
                'contributor': _contributor_name(revision),
                'timestamp': datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%SZ'),
                'timestamp_str': timestamp_str,
                'text': revision.find('text').string,
            })

    # Interleave the revisions of all pages into one chronological history.
    commits.sort(key=lambda r: r['timestamp'])

    os.mkdir(folder)
    os.chdir(folder)
    subprocess.call(['git', 'init', '.'])

    # Split once; the page file name is appended per call.  Argument lists
    # (rather than a shell string) keep quotes, '$' or backticks in page
    # titles and usernames from being interpreted by a shell.
    clean_command = shlex.split(clean_script) if clean_script else None

    for commit in commits:
        fname = commit['title'] + '.mediawiki'
        display_title = commit['title'].replace('-', ' ')
        action = 'Update' if os.path.exists(fname) else 'Create'
        # Single-argument print() behaves the same on Python 2 and 3.
        print("author: '%s', date: '%s', title: '%s'" % (commit['contributor'], commit['timestamp_str'], display_title))

        if commit['text']:
            with codecs.open(fname, 'w', 'utf-8') as page_file:
                page_file.write(commit['text'])
            if clean_command:
                subprocess.call(clean_command + [fname])
        elif action == 'Update':
            # An empty revision of an existing page is recorded as a removal.
            os.remove(fname)
            action = 'Remove'

        default_author = commit['contributor'] + ' <' + commit['contributor'].replace(' ', '_') + '@rswiki.moparisthebest.com>'
        author = user_map.get(commit['contributor'], default_author)

        subprocess.call(['git', 'add', '--all', '.'])
        subprocess.call([
            'git', 'commit',
            '--author=%s' % author,
            '--date=%s' % commit['timestamp_str'],
            '-m', "%s MediaWiki page '%s'" % (action, display_title),
        ])
|
||||
|
||||
# Maps a MediaWiki username to the git author string ('Name <email>') used
# for that user's commits.  Users not listed here get an address generated
# by convert().
user_map = {
    'Admin': 't4 <t4@rswiki.moparisthebest.com>',
    'Ambokile': 'Jameskmonger <jameskmonger@hotmail.co.uk>',
    'Arham 4': 'Arham Siddiqui <tryusyo@yahoo.com>',
    'AtomicInt': 'Ryley Kimmel <ryley.kimmel@live.com>',
    'Graham': 'Graham Edgecombe <grahamedgecombe@gmail.com>',
    'Major': 'Major- <major@emulate.rs>',
    'Moparisthebest': 'moparisthebest <admin@moparisthebest.com>',
    'Pure': 'Pure_ <mail@pure2.net>',
    'Sini': 'Sini <hadyn.richard@gmail.com>',
}

# Export to convert.  The earlier RSWiki-20150610160818.xml assignment was a
# dead store (overwritten on the next line) and has been dropped.
dump = 'RSWiki-20150610171636.xml'

# Only convert when run as a script, so importing this module has no side
# effects.  The clean script path is relative to the output folder because
# convert() changes into it before running anything.
if __name__ == '__main__':
    convert(dump, './pywiki/', '../mwbashclean.sh', user_map)
|
34
mwbashclean.sh
Executable file
34
mwbashclean.sh
Executable file
@ -0,0 +1,34 @@
|
||||
#!/bin/bash
# Clean up one freshly written MediaWiki page so its links resolve in gollum,
# then (re)generate the category pages.
#
# Usage: mwbashclean.sh <page-file>
# Run from inside the wiki working tree; category.sh is copied in from the
# parent directory on first use.  Requires GNU sed (-i, -r, \n in "1i" text).
new_file="$1"

if [ ! -f "$new_file" ]; then
    echo "usage: $0 <page-file>" >&2
    exit 1
fi

# Escape "$1" so sed treats it as a literal basic-regex pattern in s/…/…/.
# The original escaped only '[', so '/', '.', '*' or '\' in a link either
# broke the sed command or matched the wrong text.
escape_sed_pattern() {
    printf '%s\n' "$1" | sed -e 's/[][\\\/.*^$]/\\&/g'
}

# Escape "$1" so sed treats it as literal replacement text in s/…/…/
# ('&' and '\' are special there, and '/' is the delimiter).
escape_sed_replacement() {
    printf '%s\n' "$1" | sed -e 's/[\\\/&]/\\&/g'
}

# Replace every literal occurrence of "$1" with "$2" in the page file.
replace_literal() {
    sed -i "s/$(escape_sed_pattern "$1")/$(escape_sed_replacement "$2")/g" "$new_file"
}

# Fix links: Remove leading :
sed -ri 's/\[\[:([^]]*)\]\]/[[\1]]/g' "$new_file"

# Fix links: Change underscores and colons to spaces where there is a link name
# ([^]|] keeps a match inside a single link; the original [^|] could run from
# one link into the next and rewrite the ordinary text between them.)
grep -ho '\[\[[^]|]*|' "$new_file" | sort -u | grep '[_:]' | while IFS= read -r link
do
    replace_literal "$link" "$(printf '%s\n' "$link" | tr ':_' '  ')"
done

# Fix links: Change colons to spaces where there is no link name
# ([^]:] likewise stops the match from spanning two links.)
grep -ho '\[\[[^]:]*:[^]]*\]\]' "$new_file" | sort -u | while IFS= read -r link
do
    replace_literal "$link" "$(printf '%s\n' "$link" | tr ':' ' ')"
done

# Add categories to packets: pages named "<revision>-..." for the known
# protocol revisions get the generic and the per-revision packet category.
case "$new_file" in
    194-*|317-*|377-*|474-*|718-*)
        revision="${new_file%%-*}"
        sed -i "1i [[Category Packet]]\n[[Category Packet $revision]]" "$new_file"
        ;;
esac

# Add category.sh and generate initial category pages
[ ! -e ./category.sh ] && cp ../category.sh ./
./category.sh

#/home/mopar/apps/rbenv/versions/1.9.3-p392/bin/gollum --no-edit
|
Loading…
Reference in New Issue
Block a user