# Sitemap.xml Generator is a Jekyll plugin that generates a sitemap.xml file by # traversing all of the available posts and pages. # # How To Use: # 1) Copy source file into your _plugins folder within your Jekyll project. # 2) Change modify the url variable in _config.yml to reflect your domain name. # 3) Run Jekyll: jekyll --server to re-generate your site. # # Variables: # * Change SITEMAP_FILE_NAME if you want your sitemap to be called something # other than sitemap.xml. # * Change the PAGES_INCLUDE_POSTS list to include any pages that are looping # through your posts (e.g. "index.html", "archive.html", etc.). This will # ensure that right after you make a new post, the last modified date will # be updated to reflect the new post. # * A sitemap.xml should be included in your _site folder. # * If there are any files you don't want included in the sitemap, add them # to the EXCLUDED_FILES list. The name should match the name of the source # file. # * If you want to include the optional changefreq and priority attributes, # simply include custom variables in the YAML Front Matter of that file. # The names of these custom variables are defined below in the # CHANGE_FREQUENCY_CUSTOM_VARIABLE_NAME and PRIORITY_CUSTOM_VARIABLE_NAME # constants. # # Notes: # * The last modified date is determined by the latest from the following: # system modified date of the page or post, system modified date of # included layout, system modified date of included layout within that # layout, ... # # Author: Michael Levin # Site: http://www.kinnetica.com # Distributed Under A Creative Commons License # - http://creativecommons.org/licenses/by/3.0/ # # Modified for Octopress by John W. Long # require 'rexml/document' require 'fileutils' module Jekyll # Change SITEMAP_FILE_NAME if you would like your sitemap file # to be called something else SITEMAP_FILE_NAME = "sitemap.xml" # Any files to exclude from being included in the sitemap.xml EXCLUDED_FILES = ["atom.xml", "404.markdown"] # Any files that include posts, so that when a new post is added, the last # modified date of these pages should take that into account PAGES_INCLUDE_POSTS = ["index.html"] # Custom variable names for changefreq and priority elements # These names are used within the YAML Front Matter of pages or posts # for which you want to include these properties CHANGE_FREQUENCY_CUSTOM_VARIABLE_NAME = "change_frequency" PRIORITY_CUSTOM_VARIABLE_NAME = "priority" class Post attr_accessor :name def full_path_to_source File.join(@base, @name) end def location_on_server "#{site.config['url']}#{url}" end end class Page attr_accessor :name def full_path_to_source File.join(@base, @dir, @name) end def location_on_server location = "#{site.config['url']}#{@dir}#{url}" location.gsub(/index.html$/, "") end end class Layout def full_path_to_source File.join(@base, @name) end end # Recover from strange exception when starting server without --auto class SitemapFile < StaticFile def write(dest) begin super(dest) rescue end true end end class SitemapGenerator < Generator # Valid values allowed by sitemap.xml spec for change frequencies VALID_CHANGE_FREQUENCY_VALUES = ["always", "hourly", "daily", "weekly", "monthly", "yearly", "never"] # Goes through pages and posts and generates sitemap.xml file # # Returns nothing def generate(site) sitemap = REXML::Document.new << REXML::XMLDecl.new("1.0", "UTF-8") urlset = REXML::Element.new "urlset" urlset.add_attribute("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9") @last_modified_post_date = fill_posts(site, urlset) fill_pages(site, urlset) sitemap.add_element(urlset) # File I/O: create sitemap.xml file and write out pretty-printed XML unless File.exists?(site.dest) FileUtils.mkdir_p(site.dest) end file = File.new(File.join(site.dest, SITEMAP_FILE_NAME), "w") formatter = REXML::Formatters::Pretty.new(4) formatter.compact = true formatter.write(sitemap, file) file.close # Keep the sitemap.xml file from being cleaned by Jekyll site.static_files << Jekyll::SitemapFile.new(site, site.dest, "/", SITEMAP_FILE_NAME) end # Create url elements for all the posts and find the date of the latest one # # Returns last_modified_date of latest post def fill_posts(site, urlset) last_modified_date = nil site.posts.each do |post| if !excluded?(post.name) url = fill_url(site, post) urlset.add_element(url) end path = post.full_path_to_source date = File.mtime(path) last_modified_date = date if last_modified_date == nil or date > last_modified_date end last_modified_date end # Create url elements for all the normal pages and find the date of the # index to use with the pagination pages # # Returns last_modified_date of index page def fill_pages(site, urlset) site.pages.each do |page| if !excluded?(page.name) path = page.full_path_to_source if File.exists?(path) url = fill_url(site, page) urlset.add_element(url) end end end end # Fill data of each URL element: location, last modified, # change frequency (optional), and priority. # # Returns url REXML::Element def fill_url(site, page_or_post) url = REXML::Element.new "url" loc = fill_location(page_or_post) url.add_element(loc) lastmod = fill_last_modified(site, page_or_post) url.add_element(lastmod) if lastmod if (page_or_post.data[CHANGE_FREQUENCY_CUSTOM_VARIABLE_NAME]) change_frequency = page_or_post.data[CHANGE_FREQUENCY_CUSTOM_VARIABLE_NAME].downcase if (valid_change_frequency?(change_frequency)) changefreq = REXML::Element.new "changefreq" changefreq.text = change_frequency url.add_element(changefreq) else puts "ERROR: Invalid Change Frequency In #{page_or_post.name}" end end if (page_or_post.data[PRIORITY_CUSTOM_VARIABLE_NAME]) priority_value = page_or_post.data[PRIORITY_CUSTOM_VARIABLE_NAME] if valid_priority?(priority_value) priority = REXML::Element.new "priority" priority.text = page_or_post.data[PRIORITY_CUSTOM_VARIABLE_NAME] url.add_element(priority) else puts "ERROR: Invalid Priority In #{page_or_post.name}" end end url end # Get URL location of page or post # # Returns the location of the page or post def fill_location(page_or_post) loc = REXML::Element.new "loc" loc.text = page_or_post.location_on_server loc end # Fill lastmod XML element with the last modified date for the page or post. # # Returns lastmod REXML::Element or nil def fill_last_modified(site, page_or_post) path = page_or_post.full_path_to_source lastmod = REXML::Element.new "lastmod" date = File.mtime(path) latest_date = find_latest_date(date, site, page_or_post) if @last_modified_post_date == nil # This is a post lastmod.text = latest_date.iso8601 else # This is a page if posts_included?(page_or_post.name) # We want to take into account the last post date final_date = greater_date(latest_date, @last_modified_post_date) lastmod.text = final_date.iso8601 else lastmod.text = latest_date.iso8601 end end lastmod end # Go through the page/post and any implemented layouts and get the latest # modified date # # Returns formatted output of latest date of page/post and any used layouts def find_latest_date(latest_date, site, page_or_post) layouts = site.layouts layout = layouts[page_or_post.data["layout"]] while layout path = layout.full_path_to_source date = File.mtime(path) latest_date = date if (date > latest_date) layout = layouts[layout.data["layout"]] end latest_date end # Which of the two dates is later # # Returns latest of two dates def greater_date(date1, date2) if (date1 >= date2) date1 else date2 end end # Is the page or post listed as something we want to exclude? # # Returns boolean def excluded?(name) EXCLUDED_FILES.include? name end def posts_included?(name) PAGES_INCLUDE_POSTS.include? name end # Is the change frequency value provided valid according to the spec # # Returns boolean def valid_change_frequency?(change_frequency) VALID_CHANGE_FREQUENCY_VALUES.include? change_frequency end # Is the priority value provided valid according to the spec # # Returns boolean def valid_priority?(priority) begin priority_val = Float(priority) return true if priority_val >= 0.0 and priority_val <= 1.0 rescue ArgumentError end false end end end