标签:,, 发表于 Python开发 分类. 3条评论

涨工资是张佳玮的别名.

张佳玮是国内比较大的篮球论坛Hoopchina上的知名写手.

从06年开始粉涨工资的文章, 一直想着要写脚本把他在Hoopchina上的文章全部爬下来, 放进手机看.

没有特别认真去做善后处理, 里面bug估计不少, 由于我的要求是凑合着能用, 因此就不再改来改去了…

#!/usr/bin/env python
#coding=utf-8
"""
Author:         Xia Kai <xiaket@gmail.com>
Filename:       get_zjw.py
Type:           Utility
Last modified: 2009-11-19 11:42

Description:
This script retrieves all posts by Zhang Jiawei, a famous
author on a basketball forum in China.

Salute to Zhang Jiawei! Thank you for all those wonderful posts.
"""
from math import ceil
from os import access, F_OK
from string import Template
from urllib import urlopen
from re import sub, compile

# Root URL of the blog; main() appends "-<page>" to it to address the
# individual index pages.
BASE_URL = "http://my.hoopchina.com/zhangjiawei/blog"
# File name pattern of the post-list cache, keyed by the total post count.
CACHE_FILENAME = Template('${post_number}.cache')

def get_post_number():
    """
    Return the total number of posts on ZJW's blog.

    The blog home page (BASE_URL) is downloaded and scanned for the pager
    line, i.e. the line starting with '<div class="page">'. The number
    found between "共" and "篇日志" on that line is returned as an int.

    Returns None when the page has no pager line.
    Raises ValueError (from int) when the text between the two markers
    is not a number.
    """
    print("Retrieving POST Number")
    home_page = urlopen(BASE_URL)
    try:
        lines = home_page.readlines()
    finally:
        # Release the connection as soon as the page has been read
        # instead of leaving it to the garbage collector.
        home_page.close()
    count_prefix = "共"
    count_suffix = "篇日志"
    for line in lines:
        # Decode as GB2312 and re-encode as UTF-8 so the line can be
        # compared with the UTF-8 literals used in this file.
        line = line.decode("GB2312", "ignore").encode("UTF-8")
        if line.startswith('<div class="page">'):
            # Skip past the whole prefix, however many bytes it takes.
            start = line.find(count_prefix) + len(count_prefix)
            end = line.find(count_suffix)
            return int(line[start:end])
    return None

def get_post_list_from_url(url):
    """
    Return the posts listed on one index page of the blog.

    url is the address of a single index page. Every line that directly
    follows a line starting with '<div class="log_con">' is parsed as a
    post entry.

    Returns a list of dictionaries with the keys "title", "link" and
    "date", in page order.
    Raises ValueError (from str.index) when an entry line does not
    contain the expected href/title/</a>/</p> markup.
    """
    print("Generating POST list from %s." % url)
    list_file = urlopen(url)
    try:
        lines = list_file.readlines()
    finally:
        # Release the connection once the page has been read.
        list_file.close()
    post_list = []
    is_critical = False
    for line in lines:
        line = line.decode("gb2312", "ignore").encode("utf8").lstrip()
        if is_critical:
            # This line contains the critical information about the post.
            link_start = line.index('href="') + 6
            link_end = line.index('"', link_start)
            title_start = line.index('title="') + 7
            title_end = line.index('"', title_start)
            # The date is the text between the closing </a> and </p>.
            date_start = line.index('</a>', title_end) + 4
            date_end = line.index("</p>", date_start)
            title = line[title_start:title_end]
            post_list.append({
                # Titles end up in file names, where '/' would be read as
                # a directory separator; swap it for the look-alike
                # full-width solidus.
                "title": title.replace('/', "／"),
                "link": line[link_start:link_end],
                "date": line[date_start:date_end],
            })
            is_critical = False
        elif line.startswith('<div class="log_con">'):
            # The entry itself is on the next line.
            is_critical = True
    return post_list

def caching_post(post_number, POSTS):
    """
    Write the list of post dictionaries POSTS to the cache file.

    post_number selects the cache file name through CACHE_FILENAME.
    Every post is stored on its own line as "title$$$date$$$link".
    """
    filename = CACHE_FILENAME.substitute(post_number = post_number)
    cache_file = open(filename, 'w')
    print("writing cache file.")
    try:
        for post_dict in POSTS:
            cache_file.write("%s$$$%s$$$%s\n" % (
                    post_dict['title'],
                    post_dict['date'],
                    post_dict['link'],
                )
            )
    finally:
        # Closing explicitly guarantees the cache is flushed to disk,
        # even when one of the writes fails.
        cache_file.close()

def reading_cache(post_number):
    """
    Try to read the POSTS list from the cache file.

    post_number selects the cache file name through CACHE_FILENAME.

    Returns a (status, POSTS) tuple: (0, posts) when the cache file
    exists, (1, []) when it does not. Each post is a dictionary with the
    keys "title", "date" and "link". Lines that do not hold exactly
    three "$$$"-separated fields are skipped.
    """
    POSTS = []
    filename = CACHE_FILENAME.substitute(post_number = post_number)
    if not access(filename, F_OK):
        return 1, POSTS

    # cache file exist. Read it.
    print("Reading cached POSTS list.")
    cache_file = open(filename, 'r')
    try:
        lines = cache_file.readlines()
    finally:
        cache_file.close()
    for line in lines:
        # Drop the line terminator so that the link comes back exactly
        # as it was written. Splitting from the right keeps a title
        # that itself contains "$$$" in one piece.
        fields = line.rstrip("\r\n").rsplit("$$$", 2)
        if len(fields) != 3:
            # Blank or damaged line: nothing usable on it.
            continue
        title, date, link = fields
        POSTS.append({"title": title, "date": date, "link": link})
    return 0, POSTS

def write_post(url, file):
    """
    Extract the text of one hoopchina post and write it to file.

    url is the address of the post page; file is an open, writable file
    object. The content runs from the last line starting with
    '<div class="title">' up to, but not including, the last line that
    contains '点此关注他/她的动态'. Line-break tags become "\\r\\n",
    "&nbsp;" is dropped and every remaining HTML tag is removed.

    Returns 0 on success and 1 when either content marker is missing,
    in which case nothing is written.
    """
    home_page = urlopen(url)
    try:
        raw_lines = home_page.readlines()
    finally:
        # Release the connection once the page has been read.
        home_page.close()
    # Convert every line once; both passes below work on the result.
    lines = [
        raw.decode("GB2312", "ignore").encode("UTF-8").strip()
        for raw in raw_lines
    ]

    content_start = None
    content_end = None
    for index, line in enumerate(lines):
        if line.startswith('<div class="title">'):
            content_start = index
        elif line.find('点此关注他/她的动态') != -1:
            content_end = index
    if content_start is None or content_end is None:
        # Without both markers the post body cannot be located.
        print("Could not locate the post content in %s" % url)
        return 1

    html_tag = compile(r'<[^>]+>')
    for line in lines[content_start:content_end]:
        line = line.replace("<BR>", "\r\n").replace("</BR>", "\r\n")
        line = line.replace("<br />", "\r\n").replace("<br>", "\r\n")
        line = line.replace("&nbsp;", "").replace("</div>", "")
        line = html_tag.sub("", line)
        file.write(line + "\r\n")
    return 0

def main():
    """
    Grab all posts by Zhang Jiawei and save them into individual files.

    The post list is taken from the cache file when one exists for the
    current post count; otherwise it is scraped from the index pages and
    cached. Each post is saved as "<date>__<title>.txt" in the current
    directory; files that already exist are skipped.

    Exits with status 1 when the post count cannot be determined or a
    post cannot be written.
    """
    import sys
    from os import remove

    post_number = get_post_number()
    if post_number is None:
        print("Could not determine the number of posts.")
        sys.exit(1)
    print("Would generate %s posts." % post_number)
    status, POSTS = reading_cache(post_number)
    if status == 1:
        # The page count assumes ten posts per index page.
        post_pages = int(ceil(post_number / 10.0))
        for index in range(1, post_pages + 1):
            url = BASE_URL + "-%s" % index
            POSTS.extend(get_post_list_from_url(url))
        # Write POSTS to a file so it may be reused.
        caching_post(post_number, POSTS)
        print("Done getting post list.")
    else:
        print("Using cached post list.")

    for post_dict in POSTS:
        filename = "%s__%s.txt" % (post_dict['date'], post_dict['title'])
        if access(filename, F_OK):
            print("File %s exist, skipping." % filename)
            continue
        post_file = open(filename, 'w')
        # Treated as failed until write_post reports success, so that an
        # exception is cleaned up the same way as an error status.
        status = 1
        try:
            print("Writing post: %s" % filename)
            # Links read back from the cache may carry a trailing newline.
            status = write_post(post_dict['link'].strip(), post_file)
        finally:
            post_file.close()
            if status != 0:
                # Do not leave a partial file behind: the next run would
                # take it for a finished download and skip the post.
                remove(filename)
        if status != 0:
            sys.exit(1)

# Only start downloading when executed as a script, not when imported.
if __name__ == "__main__":
    main()
2009-11-22 19:37