Add tools to convert

This commit is contained in:
Travis Burtrum 2015-06-09 22:57:26 -04:00
parent a729ec2114
commit d93c62b0d7
9 changed files with 18047 additions and 1 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
wiki/

1
.ruby-version Normal file
View File

@ -0,0 +1 @@
1.9.3-p392

View File

@ -1 +1,10 @@
This is the wiki hosted at rswiki.moparisthebest.com
This is the wiki hosted at rswiki.moparisthebest.com, in the wiki git repo.
In this repo you'll find the tools we used for converting from mediawiki to gollum:
* legit_pages.py was written by vortex, and used to scrape and generate legit_pages.txt, which was used to export RSWiki-20150610160818.xml
* mw-to-gollum.rb was slightly modified from here: https://gist.github.com/MasterRoot24/ab85de0e7b82ba7f5974
* mediawiki2gollum.sh uses mw-to-gollum.rb to convert the mediawiki xml, then does various things to clean up all the links and names so they will work
* category.sh scrapes the wiki pages and generates category pages like MediaWiki's; it needs to be run whenever pages are added to categories
todo: historical versions not converted/saved yet

17657
RSWiki-20150610160818.xml Normal file

File diff suppressed because it is too large Load Diff

52
category.sh Executable file
View File

@ -0,0 +1,52 @@
#!/bin/bash
# Work from the directory that contains this script: everything below uses
# relative globs and relative output paths. Stop if that directory cannot be
# entered, so pages are never scanned or generated somewhere else.
cd "$(dirname "$0")" || exit 1
# prepareFile PATH
#
# Stages a regenerated copy of PATH in "<basename of PATH>.tmp" (in the current
# directory) and prints that temporary file name on stdout.
#
# The staged file contains the hand-written part of PATH followed by the
# "DO NOT EDIT BELOW THIS LINE" marker comment:
#   * PATH already has the marker -> everything before the marker line is kept;
#   * PATH has no marker          -> the whole file plus one blank line is kept;
#   * PATH does not exist yet     -> only the marker is written (no error noise).
# The caller appends the generated section and then calls finishFile.
function prepareFile(){
    local path="$1"
    local tmp_path
    tmp_path="$(basename "$path").tmp"
    local marker='CODE AUTOMATICALLY GENERATED BY category.sh -->'
    (
        # A category page that has never been written has no hand-written part.
        if [ -f "$path" ]
        then
            if grep -q "$marker" "$path"
            then
                # Print lines up to, but not including, the marker line.
                sed -n "/$marker/q;p" "$path"
            else
                cat "$path" && echo
            fi
        fi
        echo '<!-- DO NOT EDIT BELOW THIS LINE, OR CHANGE THIS COMMENT, CODE AUTOMATICALLY GENERATED BY category.sh -->'
    ) > "$tmp_path"
    echo "$tmp_path"
}
# finishFile PATH
#
# Replaces PATH with the staged "<basename of PATH>.tmp" file from the current
# directory. The working variables are local so the caller's own
# `path`/`tmp_path` are left untouched, and `--` keeps a file name that starts
# with a dash from being read as an option by mv.
function finishFile(){
    local path="$1"
    local tmp_path
    tmp_path="$(basename "$path").tmp"
    mv -- "$tmp_path" "$path"
}
# Collect every line that consists solely of a "[[Category Name]]" tag, as
# "file:[[Category Name]]". -H forces the "file:" prefix even when the glob
# expands to a single file; "--" protects against file names starting with "-".
files_lines_categories="$(grep -H '^\[\[Category [^]./\]*\]\]$' -- *)"
#echo "files_lines_categories: $files_lines_categories"
# Categories.mediawiki is the index of all categories and is rebuilt from scratch.
echo '<!-- DO NOT EDIT THIS FILE, CODE AUTOMATICALLY GENERATED BY category.sh -->' > Categories.mediawiki
echo 'The following categories contain pages or media.' >> Categories.mediawiki
# One iteration per distinct "Category Name" found above.
echo "$files_lines_categories" | grep -o 'Category [^]]*' | sort -u | while read -r category
do
    #echo "category: $category"
    # The category's own page: spaces in the name become dashes.
    category_file="${category// /-}.mediawiki"
    echo "category_file: $category_file"
    # Stage the hand-written part of the page; the generated section is appended below.
    result="$(prepareFile "$category_file")"
    echo "== '''Pages in category \"${category#Category }\"''' ==" >> "$result"
    # All "file:[[Category Name]]" entries tagged with exactly this category.
    files="$(echo "$files_lines_categories" | grep ":\[\[${category}\]\]$" | sort)"
    num_pages="$(echo "$files" | wc -l)"
    echo "* [[$category]] ($num_pages members)" >> Categories.mediawiki
    echo "The following $num_pages pages are in this category." >> "$result"
    # Turn "Some-Page.mediawiki:[[Category Name]]" into the link "[[Some Page]]".
    suffix=".mediawiki:[[${category}]]"
    echo "$files" | while read -r file
    do
        page="${file%"$suffix"}"
        echo "* [[${page//-/ }]]"
    done >> "$result"
    finishFile "$category_file"
done

34
legit_pages.py Normal file
View File

@ -0,0 +1,34 @@
import requests, logging
from bs4 import BeautifulSoup
from queue import Queue
from threading import Thread
# Raise the "requests" logger threshold to WARNING so that lower-severity
# records from that logger are not emitted while crawling.
logging.getLogger("requests").setLevel(logging.WARNING)
def get_pages(page):
    """Crawl the wiki depth-first from ``page``, recording each title once.

    A title that has not been seen yet is appended to the module-level
    ``all_pages`` list, written as one line to the module-level ``ebin`` file
    and printed. The page is then fetched and every link inside its
    ``mw-content-text`` div that points at ``?title=<name>`` is followed
    recursively.

    Links are not followed when the target contains ``&`` (extra query
    parameters), contains ``Special:``, or is the page itself. A ``#fragment``
    is stripped from the target before those checks.

    :param page: page title exactly as it appears after ``?title=`` in a URL.
    :returns: ``None``; results are accumulated in ``all_pages`` and ``ebin``.
    """
    if page in all_pages:
        return
    all_pages.append(page)
    ebin.write(page + "\n")
    print(page + "\n")
    req = requests.get("https://rswiki.moparisthebest.com/index.php?title=" + page)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(req.text, "html.parser")
    content = soup.find("div", id="mw-content-text")
    if content is None:
        # No article body in the response: nothing to follow from this page.
        return
    marker = "?title="
    # href=True skips <a> tags that have no href attribute (e.g. named anchors).
    for anchor in content.find_all("a", href=True):
        link = anchor["href"]
        off = link.find(marker)
        # str.find() signals "not found" with the value -1; compare by value.
        if off == -1:
            continue
        next_page = link[off + len(marker):].split("#", 1)[0]
        if "&" not in next_page and "Special:" not in next_page and next_page != page:
            get_pages(next_page)
# Titles already visited, in discovery order; get_pages() appends to this list.
all_pages = []
# get_pages() writes each discovered title to the module-level `ebin` file.
# The with-block guarantees the file is flushed and closed when the crawl
# finishes or fails. get_pages() returns nothing, so its result is not kept.
with open("legit_pages.txt", "w") as ebin:
    get_pages("Main_Page")

190
legit_pages.txt Normal file
View File

@ -0,0 +1,190 @@
Main_Page
Rules
DMCA_Policy
RSWiki:General_disclaimer
RSWiki:Privacy_policy
RSWiki_IRC
RSWiki:About
Category:RSC
135_Protocol
202_Protocol
204_Items
204_NPCs
204_Objects
204_Protocol
OB3
Category:RS2
194_Protocol
194:Show_interface
Data_Types
Word
DWord
QWord
RS_String
194:Logout
194:Clear_screen
289_Protocol
289:Send_sidebar_interface
317:Send_Skill
289:Send_Player_Head
317:Friends_list_status
289:Construct_Map_Region
317_Protocol
Category:Packet:317
289:Player_Dialogue_Head
317:Add_friend
317:Add_ignore
317:Alternate_item_option_2
317:Animation_reset
317:Attack_(NPC)
317:Audio
317:Bank_10_items
317:Bank_5_items
317:Bank_all_items
317:Bank_X_items_part-1
317:Bank_X_items_part-2
317:Button_click
317:Camera_movement
317:Camera_shake
317:Chat_interface
317:Chat_interface_click
317:Chat_settings
317:Clear_inventory
317:Clear_screen
317:Close_window
317:Construct_map_region
317:Design_screen
317:Display_hint_icon
317:Drop_item
317:Enter_name
317:Equip_item
317:Flash_sidebar
317:Focus_change
317:Follow
317:Force_client_setting
317:Hidden_Interface
317:Idle
317:Idle_logout
317:Initialize_player
317:Input_amount
317:Interface_animation
317:Interface_color
317:Interface_item
317:Interface_model_rotation
317:Interface_offset
317:Interface_over_tab
317:Inventory_overlay
317:Item_action_1
317:Item_on_floor
317:Item_on_item
317:Item_on_object
317:Item_on_player
317:Light_item
317:Load_map_region
317:Loading_finished
317:Logout
317:Magic_on_items
317:Magic_on_player
317:Minimap_State
317:Move_item
317:Music
317:NPC_action_1
317:NPC_action_2
317:NPC_action_3
317:NPC_head_on_interface
317:Object_action_1
317:Object_action_2
317:Object_action_3
317:Open_chatbox_interface
317:Open_welcome_screen
317:Pickup_ground_item
317:Play_song
317:Player_command
317:Player_head_to_interface
317:Privacy_options
317:Region_change
317:Remove_friend
317:Remove_ignore
317:Report_player
317:Reset_button_state
317:Reset_camera
317:Reset_destination
317:Run_energy
317:Scroll_position
317:Send_add_ignore
317:Send_message
317:Send_sidebar_interface
317:Set_interface_text
317:Show_interface
317:Show_multi-combat
317:Show_tab
317:Skill_level
317:Song_Queue
317:System_update
317:Trade_answer
317:Trade_request
317:Unequip_item
317:Update_item_container
317:Walkable_interface
317:Weight
317:Send_add_friend
317:Begin_player_updating
317:Object_removal
317:Create_Projectile
317:Object_spawn
317:Send_private_message
317:NPC_Dialogue
317:Mouse_click
317:Ground_Item_Action
357_Protocol
377_Protocol
377:Interface_Animation
377:Move_Camera
377:Logout
377:Send_Sidebar_Interface
377:Animation_Reset
377:Interface_Item
377:Send_Sound
377:Reset_Ground_Items_and_Objects
377:Play_Ambient_Wave
377:Skill_Level
377:Walkable_Interface
377:Construct_Map_Region
377:Input_Amount
377:Create_Static_Graphic
377:Send_Message
377:Camera_Shake
377:Open_Welcome_Screen
377:Send_Add_Friend
377:Send_Ground_Item
377:Reset_Button_State
377:Run_Energy
377:Initialize_Player
377:Inventory_Overlay
443_Protocol
Class_Check
464_Protocol
468_Protocol
474_Protocol
474:Remove_ignore
474:Fourth_Interface_Option
508_Protocol
634_Protocol
666_Protocol
668_Protocol
718_Protocol
718:Friends_packet
718:Close_window
718:Player_under_NPC_priority
718:Music_effect
718:Interface
718:Open_URL
742_Protocol
Category:RS3
Category:Cache
Archive_Format
JAGGRAB_Protocol
Ondemand_Protocol
Map_Region_System
317:Mage_NPC
317:Player_Option

51
mediawiki2gollum.sh Executable file
View File

@ -0,0 +1,51 @@
#!/bin/bash
# Work from the directory that contains this script: the steps below delete,
# create and rewrite files through relative paths. Stop if that directory
# cannot be entered instead of running them somewhere else.
cd "$(dirname "$0")" || exit 1
# commit MESSAGE
#
# Stages every change under the current directory and records it as one git
# commit with MESSAGE as the commit message. The exit status is git commit's.
function commit {
    local message="$1"
    git add .
    git commit -m "$message"
}
# Start from a clean output directory and convert the MediaWiki XML export
# into a gollum wiki under ./wiki/.
rm -rf ./wiki/
ruby mw-to-gollum.rb -f RSWiki-20150610160818.xml -d ./wiki/ || exit 1
# Every step below rewrites and commits "everything in the current directory".
# Stop if the wiki directory is missing so that never happens to this repo.
cd ./wiki/ || exit 1
cp ../.ruby-version ./
commit 'Added .ruby-version'
cp Main-Page.mediawiki Home.mediawiki
commit 'Copied Main-Page to Home'
# [[:Target]] -> [[Target]]
sed -ri 's/\[\[:([^]]*)\]\]/[[\1]]/g' *
commit 'Fix links: Remove leading :'
# Piped links ("[[target|label]]"): for every distinct "[[target|" prefix that
# contains "_" or ":", replace those characters with spaces in all files.
# NOTE(review): only "[" is escaped before the text is used as a sed pattern;
# targets containing other sed metacharacters (e.g. "/", ".", "&") are not
# handled - confirm none occur in the export.
grep -ho '\[\[[^|]*|' * | sort | uniq | grep '[_:]' | while read -r line
do
    sed_line=$(echo "$line" | sed -e 's/\[/\\[/g')
    rep_line=$(echo "$line" | tr ':' ' ' | tr '_' ' ')
    sed -i "s/$sed_line/$rep_line/g" *
done
commit 'Fix links: Change underscores and colons to spaces where there is a link name'
# Remaining links that contain a colon: replace the colons with spaces.
grep -ho '\[\[[^:]*:[^]]*\]\]' * | sort | uniq | while read -r line
do
    sed_line=$(echo "$line" | sed -e 's/\[/\\[/g')
    rep_line=$(echo "$line" | tr ':' ' ')
    sed -i "s/$sed_line/$rep_line/g" *
done
commit 'Fix links: Change colons to spaces where there is no link name'
# Tag every packet page with the generic category, then with its revision's
# category. Each glob appears exactly once in the first command: a file named
# twice would get the line inserted twice by sed -i.
sed -i '1i [[Category Packet]]' 194-* 317-* 377-* 474-* 718-*
sed -i '1i [[Category Packet 194]]' 194-*
sed -i '1i [[Category Packet 317]]' 317-*
sed -i '1i [[Category Packet 377]]' 377-*
sed -i '1i [[Category Packet 474]]' 474-*
sed -i '1i [[Category Packet 718]]' 718-*
commit 'Add categories to packets'
cp ../category.sh ./
./category.sh
commit 'Add category.sh and generate initial category pages'
# Serve the result read-only. Set GOLLUM to use a different gollum executable;
# the default is the original author's rbenv install.
"${GOLLUM:-/home/mopar/apps/rbenv/versions/1.9.3-p392/bin/gollum}" --no-edit

51
mw-to-gollum.rb Normal file
View File

@ -0,0 +1,51 @@
require 'rubygems'
require 'hpricot'
require 'gollum'
require 'gollum-lib'
require 'optparse'
require 'git'
# from https://gist.github.com/MasterRoot24/ab85de0e7b82ba7f5974
# gem install hpricot gollum git wikicloth
# Parse command line options; both --file and --directory are required.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = 'Usage: ruby mw-to-gollum.rb --file input-file.xml --directory new.wiki'

  opts.on('-f FILE', '--file FILE', 'MediaWiki export file to import') do |v|
    options[:file] = v
  end

  opts.on('-d DIRECTORY', '--directory DIRECTORY', 'Destination directory in which to create a new Gollum wiki') do |v|
    options[:destination] = v
  end
end

# Report unknown options or missing option arguments with the usage text
# instead of an uncaught exception and backtrace.
begin
  parser.parse!
rescue OptionParser::ParseError => e
  abort "#{e.message}\n#{parser.help}"
end

# Stop early with the usage text when an option is missing; otherwise nil
# would be handed to File.open / Git.init below.
abort parser.help unless options[:file] && options[:destination]

# Open the input file (closed automatically when the block exits, even on
# error) and create the output repo if it doesn't already exist
File.open(options[:file], 'r') do |file|
  git = Git.init(options[:destination])
  wiki = Gollum::Wiki.new(options[:destination])
  doc = Hpricot(file)

  # Get the Git user name and email
  name = git.config('user.name')
  email = git.config('user.email')

  # Loop through each page in the MediaWiki dump file and create a new page in the Gollum wiki
  doc.search('/mediawiki/page').each do |el|
    # Colons in the title (e.g. "317:Logout") are replaced with spaces.
    title = el.at('title').inner_text.tr(':', ' ')
    content = el.at('text').inner_text
    commit = { :message => "Import MediaWiki page #{title} into Gollum",
               :name => name,
               :email => email }

    begin
      puts "Writing page #{title}"
      wiki.write_page(title, :mediawiki, content, commit)
    rescue Gollum::DuplicatePageError
      puts "Duplicate #{title}"
    rescue StandardError => e
      # Log and continue with the next page. StandardError (not Exception)
      # lets Ctrl-C and exit requests still stop the import.
      puts e, e.backtrace
    end
  end
end