<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>年华转瞬 &#187; 张佳玮</title>
	<atom:link href="http://blog.xiaket.org/tag/%e5%bc%a0%e4%bd%b3%e7%8e%ae/feed/" rel="self" type="application/rss+xml" />
	<link>http://blog.xiaket.org</link>
	<description>xiaket 的网志</description>
	<lastBuildDate>Sat, 21 Aug 2010 02:31:27 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.0.1</generator>
		<item>
		<title>抓涨工资文章的脚本</title>
		<link>http://blog.xiaket.org/2009/11/22/python-spider-zhangjiawei/</link>
		<comments>http://blog.xiaket.org/2009/11/22/python-spider-zhangjiawei/#comments</comments>
		<pubDate>Sun, 22 Nov 2009 11:37:14 +0000</pubDate>
		<dc:creator>xiaket</dc:creator>
				<category><![CDATA[Python开发]]></category>
		<category><![CDATA[张佳玮]]></category>
		<category><![CDATA[篮球]]></category>

		<guid isPermaLink="false">http://blog.xiaket.org/?p=104</guid>
		<description><![CDATA[涨工资是张佳玮的别名.

张佳玮是国内比较大的篮球论坛Hoopchina上的知名写手.

从06年开始粉涨工资的文章, 一直想着要写脚本把他在Hoopchina上的文章全部爬下来, 放进手机看.

<span class="readmore"><a href="http://blog.xiaket.org/2009/11/22/python-spider-zhangjiawei/" title="抓涨工资文章的脚本">阅读全文——共4206字</a></span>]]></description>
			<content:encoded><![CDATA[<p>涨工资是张佳玮的别名.</p>
<p>张佳玮是国内比较大的篮球论坛<a href="http://www.hoopchina.com">Hoopchina</a>上的知名写手.</p>
<p>从06年开始粉涨工资的文章, 一直想着要写脚本把他在Hoopchina上的文章全部爬下来, 放进手机看.</p>
<p>没有特别认真去做善后处理, 里面bug估计不少, 由于我的要求是凑活着能用, 因此就不再改来改去了&#8230;</p>
<pre class="brush: python;">
#!/usr/bin/env python
#coding=utf-8
&quot;&quot;&quot;
Author:         Xia Kai &lt;xiaket@gmail.com&gt;
Filename:       get_zjw.py
Type:           Utility
Last modified: 2009-11-19 11:42

Description:
This script retrieves all posts by Zhang Jiawei, a famous
author on a basketball forum in China.

Salute to Zhang Jiawei! Thank you for all those wonderful posts.
&quot;&quot;&quot;
from math import ceil
from os import access, F_OK
from string import Template
from urllib import urlopen
from re import sub, compile

# Address of the blog home page. main() builds the listing page addresses
# by appending -N (N being the page number) to this value.
BASE_URL = &quot;http://my.hoopchina.com/zhangjiawei/blog&quot;
# Name pattern of the post list cache file. ${post_number} is replaced with
# the total post count, so a different count maps to a different cache file.
CACHE_FILENAME = Template('${post_number}.cache')

def get_post_number():
    &quot;&quot;&quot;
    This function would get the number of posts by ZJW.
    &quot;&quot;&quot;
    print &quot;Retrieving POST Number&quot;
    home_page = urlopen(BASE_URL)
    lines = home_page.readlines()
    for line in lines:
        line = line.decode(&quot;GB2312&quot;, &quot;ignore&quot;).encode(&quot;UTF-8&quot;)
        if line.startswith('&lt;div class=&quot;page&quot;&gt;'):
            start = line.find(&quot;共&quot;) + 3
            end = line.find(&quot;篇日志&quot;)
            return int(line[start:end])

def get_post_list_from_url(url):
    &quot;&quot;&quot;
    For every url given as parameter, there are several pages,
    we shall get all the post links relative to the url.
    &quot;&quot;&quot;
    print &quot;Generating POST list from %s.&quot; % url
    list_file = urlopen(url)
    lines = list_file.readlines()
    is_critical = False
    post_list = []
    for line in lines:
        line = line.decode(&quot;gb2312&quot;, &quot;ignore&quot;).encode(&quot;utf8&quot;).lstrip()
        if is_critical:
            # This line contain critical information about the post.
            post_dict = {}
            link_start = line.index('href=&quot;') + 6
            link_end = line.index('&quot;', link_start)
            title_start = line.index('title=&quot;') + 7
            title_end = line.index('&quot;', title_start)
            date_start = line.index('&lt;/a&gt;', title_end) + 4
            date_end = line.index(&quot;&lt;/p&gt;&quot;, date_start)
            link = line[link_start:link_end]
            title = line[title_start:title_end]
            date = line[date_start:date_end]
            post_dict[&quot;title&quot;] = title.replace('/', &quot;／&quot;)
            post_dict[&quot;link&quot;] = link
            post_dict[&quot;date&quot;] = date
            is_critical = False
            post_list.append(post_dict)
        elif line.startswith('&lt;div class=&quot;log_con&quot;&gt;'):
            is_critical = True
    return post_list

def caching_post(post_number, POSTS):
    &quot;&quot;&quot;
    This function would write the list of dictionaries POSTS to a file.
    &quot;&quot;&quot;
    filename = CACHE_FILENAME.substitute(post_number = post_number)
    file = open(filename, 'w')
    print &quot;writing cache file.&quot;
    for post_dict in POSTS:
        file.write(&quot;%s$$$%s$$$%s\n&quot; % (
                post_dict['title'],
                post_dict['date'],
                post_dict['link'],
            )
        )

def reading_cache(post_number):
    &quot;&quot;&quot;
    This function would try to read POSTS cache from file.
    If cache exist, we shall return 0 and the POSTS, or else
    we shall return 1 and an empty list.
    &quot;&quot;&quot;
    POSTS = []
    filename = CACHE_FILENAME.substitute(post_number = post_number)
    if access(filename, F_OK):
        # cache file exist. Read it.
        print &quot;Reading cached POSTS list.&quot;
        file = open(filename, 'r')
        lines = file.readlines()
        for line in lines:
            line_list = line.split(&quot;$$$&quot;)
            post_dict = {}
            post_dict[&quot;title&quot;] = line_list[0]
            post_dict[&quot;date&quot;] = line_list[1]
            post_dict[&quot;link&quot;] = line_list[2]
            POSTS.append(post_dict)
        return 0, POSTS
    else:
        return 1, POSTS

def write_post(url, file):
    &quot;&quot;&quot;
    This function would extract the content of a hoopchina webpage.
    &quot;&quot;&quot;
    home_page = urlopen(url)
    lines = home_page.readlines()
    for index, line in enumerate(lines):
        line = line.decode(&quot;GB2312&quot;, &quot;ignore&quot;).encode(&quot;UTF-8&quot;).strip()
        if line.startswith('&lt;div class=&quot;title&quot;&gt;'):
            content_start = index
        elif line.find('点此关注他/她的动态') != -1:
            content_end = index
    for line in lines[content_start:content_end]:
        line = line.decode(&quot;GB2312&quot;, &quot;ignore&quot;).encode(&quot;UTF-8&quot;).strip()
        line = line.replace(&quot;&lt;BR&gt;&quot;, &quot;\r\n&quot;).replace(&quot;&lt;/BR&gt;&quot;, &quot;\r\n&quot;)
        line = line.replace(&quot;&lt;br /&gt;&quot;, &quot;\r\n&quot;).replace(&quot;&lt;br&gt;&quot;, &quot;\r\n&quot;)
        line = line.replace(&quot;&amp;nbsp;&quot;, &quot;&quot;).replace(&quot;&lt;/div&gt;&quot;, &quot;&quot;)
        html_tag = compile(r'&lt;[^&gt;]+&gt;')
        line = sub(html_tag, &quot;&quot;, line)
        file.write(line + &quot;\r\n&quot;)
    return 0

def main():
    &quot;&quot;&quot;
    This file would grab all posts by Zhang Jiawei and
    save them into individual files.
    &quot;&quot;&quot;
    post_number = get_post_number()
    print &quot;Would generate %s posts.&quot; % post_number
    status, POSTS = reading_cache(post_number)
    if status == 1:
        post_pages = int(ceil(post_number / 10.0))
        for index in range(1, post_pages + 1):
            url = BASE_URL + &quot;-%s&quot; % index
            post_list = get_post_list_from_url(url)
            POSTS += post_list
            # Write POSTS to a file so it may be reused.
        caching_post(post_number, POSTS)
        print &quot;Done getting post list.&quot;
    else:
        print &quot;Using cached post list.&quot;

    for post_dict in POSTS:
        filename = &quot;%s__%s.txt&quot; % (post_dict['date'], post_dict['title'])
        if access(filename, F_OK):
            print &quot;File %s exist, skipping.&quot; % filename
            continue
        file = open(filename, 'w')
        print &quot;Writing post: %s&quot; % filename
        status = write_post(post_dict['link'], file)
        file.close()
        if status != 0:
            import sys
            sys.exit(0)

if __name__ == &quot;__main__&quot;:
    main()
</pre>
]]></content:encoded>
			<wfw:commentRss>http://blog.xiaket.org/2009/11/22/python-spider-zhangjiawei/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
		</item>
	</channel>
</rss>
