Scraping news from all the airline sites with Python

#coding=utf-8

import urllib
import time
import re
import threading
import string
import urllib2
import chardet
import MySQLdb
import base64
import os
import sys
from pyquery import PyQuery
reload(sys)
sys.setdefaultencoding("utf-8")

#earliest news date to crawl
earliestTime = "2010-06-01"
mutex = threading.Lock()

def printf(obj):
    """多线程的输出"""
    global mutex
    mutex.acquire()
    print obj
    mutex.release()
    

def mysqlDataBase_init(database="jipiao"):
    """连接mysql
返回conn,cursor"""
    conn = MySQLdb.connect(host="localhost", user="root", passwd="", db=database, unix_socket="/opt/lampp/var/mysql/mysql.sock")
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8")
    cursor.execute("SET CHARACTER_SET_CLIENT=utf8")
    cursor.execute("SET CHARACTER_SET_RESULTS=utf8")
    conn.commit()
    return (conn, cursor)       

def countNewsUrl(cursor, newsUrl):
    "count  newsUrl 个数"
    selectSql = "select count(*) from jipiaonews where newsUrl='%s'" % newsUrl
    cursor.execute(selectSql)
    result = cursor.fetchone()
    return result[0]
    pass

def checkNewsUrl(cursor, newsUrl):
    """检查newsUrl
返回是否已经存在"""
    #已经有的url,不需要抓取了
    if countNewsUrl(cursor, newsUrl) > 0:
        return 1
    
    #thread-safe output of the newly found URL
    printf(newsUrl)
    return 0

def executeSql(cursor, title, postDate, content, airport, source, newsUrl):
    """执行sql语句"""
    #替换不友好字符
    content = content.replace("'", "")
    content = content.replace("\"", "")
    
    insertsql = ("insert into jipiaonews (title,postDate,content,airport,source,newsUrl)" 
            " values('%s','%s','%s','%s','%s','%s')"
            % (title, postDate, content, airport, source, newsUrl))    
    insertsql = insertsql.encode("utf-8")
    cursor.execute(insertsql)    
    pass

def parseDate(postDate):
    """转换日期"""
    return postDate.replace("年", "-").replace("月", "-").replace("日", "")
    pass

def xiamenhangkong():
    """厦门航空"""
    airport = "厦门航空"
    matchPostDate=re.compile(r"[\d]+-[\d]+-[\d]")
    
    conn,cursor=mysqlDataBase_init()
    page=1
    while page!=-1:
        newsPageUrl="http://www.xiamenair.com.cn/newslist/news-%s.shtml"%page
        page+=1
        
        try:
            pq=PyQuery(newsPageUrl)
        except:
            break
        i=0
        while True:
            a= pq("table.artlist a:eq(%s)"%i)
            i+=1
            
            newsUrl=a.attr("href")
            if not newsUrl:
                #not even a first news link on this page: stop paging
                if i==1:
                    page=-1
                break
            newsUrl="http://www.xiamenair.com.cn"+newsUrl
            if checkNewsUrl(cursor, newsUrl):
                page=-1
                break
            title=a.text()
            tr=pq("table.artlist tr:eq(%s)"%i)
            postDate = matchPostDate.search(tr.text()).group()
            newpq=PyQuery(newsUrl)
            content=newpq("div.artcon").text()

            if content.find(")")!=-1:
                content=content[content.find(")")+1:]
            content=content.strip() 
            source="" 
            
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        pass
    conn.commit()
    conn.close()
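#(commented out) earlier version of xiamenhangkong that scraped the old about_news.aspx pages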
#    newsBasePage = "http://www.xiamenair.com.cn/about_news.aspx?Page=%s"
#    #few news items, so the cutoff date was pushed back
#    ownEarliestTime = "2010-01-01"
#    #page number
#    pages = 1
#    while pages != -1:
#        newsBaseUrl = newsBasePage % pages    
#        pages += 1
#        
#        pq = PyQuery(newsBaseUrl)
#        table = pq("div#header table:eq(25)")
#        conn, cursor = mysqlDataBase_init()
#        index = 0
#        while True:
#            a = table.find("a:eq(%s)" % index)
#            index += 1
#
#            newsUrl = a.attr("href")
#            if not newsUrl:
#                break
#            newsUrl = "http://www.xiamenair.com.cn/" + newsUrl    
#            title = a.text()
#            if checkNewsUrl(cursor, newsUrl):
#                pages=-1#stop
#                break
#            newspq = PyQuery(newsUrl)
#            newsTable = newspq("table:eq(3)")    
#            postDate = newsTable("div:eq(1)").html()  
#            #stop condition
#            if postDate < ownEarliestTime:
#                pages = -1#stop
#                break  
#            content = newsTable("table:eq(1)").text()
#            source = "" 
#            postDate = postDate.encode("utf8")
#            
#            executeSql(cursor, title, postDate, content, airport, source, newsUrl)           
#            pass
#        conn.commit()
#        conn.close()
#        pass    
#    pass

def chongqinghangkong():
    """重庆航空"""
    airport = "重庆航空"
    
    newsBaseUrl = "http://www.flycq.com/info/news/index.html"
    pq = PyQuery(newsBaseUrl) 
    conn, cursor = mysqlDataBase_init()
    #regex used to pull the date out of the URL
    patternDate = re.compile(r"[\d]{6}")
    i = 0
    while i!=-1:
        a = pq("div#newsContent div ul li:eq(%s)" % i).find("a")
        newsUrl = a.attr("href")
        i += 1
        if not newsUrl:
            break
        if newsUrl.startswith("http"):
            continue
        #the URL embeds the date as yymmdd; prefix "20" for the full year
        postDate = "20" + patternDate.search(newsUrl).group()
        #a few URLs have to be rewritten relative to the news directory
        newsUrl = newsUrl[newsUrl.rfind('/') + 1:]       
        try:         
            newsUrl = "http://www.flycq.com/info/news/" + newsUrl
            if checkNewsUrl(cursor, newsUrl):
                i=-1#stop
                break
            newspq = PyQuery(newsUrl)
        except urllib2.HTTPError:
            printf(newsUrl + "   illegal url!")
            continue            
        content = newspq("div#newsContent").text()
        if content.find("InstanceBeginEditable") != -1:
            content = content[len("InstanceBeginEditable name=\"content\" "):]
        if content.find("InstanceEndEditable") != -1:
            content = content[:-len("InstanceEndEditable")]

        title = a.text()
        source = ""

        executeSql(cursor, title, postDate, content, airport, source, newsUrl)   
        pass   
    conn.commit()
    conn.close()    
    pass

def hannanhangkong():
    """海南航空"""
    airport = "海南航空"
    
    matchUrl = re.compile(r"/.*\.html")
    matchPostDate = re.compile(r"[\d]{4}-[\d]+-[\d]+")
    
    page = 0
    while page != -1:
        if page == 0:
            newsPageUrl = "http://www.hnair.com/gyhh/hhxw/index.html" 
        else:
            newsPageUrl = "http://www.hnair.com/gyhh/hhxw/index_%s.html" % page
        page += 1
        
        conn, cursor = mysqlDataBase_init()
        pq = PyQuery(newsPageUrl)
        i = 0
        while True:
            a = pq("div.body a:eq(%s)" % i)
            newsUrl = a.attr("onclick")
            if not newsUrl:
                break
            newsUrl = matchUrl.search(newsUrl).group()
            newsUrl = "http://www.hnair.com/gyhh/hhxw" + newsUrl
            li = pq("div.body li:eq(%s)" % i)
            i += 1
            
            postDate = matchPostDate.search(li.text()).group()
            if postDate < earliestTime:
                page = -1#stop paging
                break
            if checkNewsUrl(cursor, newsUrl):
                page = -1#stop paging
                break
            title = li.text()[:-(len(postDate) + 2)]                    
            newspq = PyQuery(newsUrl)
            content = newspq("div.body").text()
            if content.find("本报讯") != -1 and content.find(")") != -1:
                content = content[content.find(")") + len(")"):]
            source = ""

            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()
        pass

def xingfuhangkong():
    """幸福航空"""
    airport = "幸福航空"
    newsBasePage = "http://www.joy-air.com/AboutUs/NewsCenter/List.aspx?id=1"
    pq = PyQuery(newsBasePage)
    
    url = pq("div#newslist").find("a").attr("href") 
    id = int(re.search(r"[\d]+", url).group())
    newsBaseUrl = "http://www.joy-air.com/AboutUs/NewsCenter/Detail.aspx?id="
    
    conn, cursor = mysqlDataBase_init()
    while True:
        newsUrl = newsBaseUrl + str(id)
        id -= 1
        
        if checkNewsUrl(cursor, newsUrl):
            break                   
        try:
            newspq = PyQuery(newsUrl)
        except urllib2.HTTPError:
            continue
        postDate = newspq("div.dynamicmessagesbox span:eq(1)").text()
        #stop condition
        if postDate < earliestTime:        
            break 
        content = newspq("div.dynamicmessagesbox").text()
        title = newspq("div.dynamicmessagesbox span:eq(0)").text()
        source = ""
            
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass

def shenzhenhangkong():
    """深圳航空"""
    airport = "深圳航空"
    newsPageUrl = "http://www.shenzhenair.com/more.jsp?category=423afc988259499faa94bbaf8b6983b5&title=%E6%B7%B1%E8%88%AA%E6%96%B0%E9%97%BB&lan=1" 
    
    sourceFile = urllib.urlopen(newsPageUrl).read()
    pq = PyQuery(sourceFile.decode("gbk"))
    
    conn, cursor = mysqlDataBase_init()
    newspqs = pq("table:eq(17)")
    match = re.compile(r"</script>.*")
    i = 0
    while True:
        a = newspqs("a:eq(%s)" % i)
        i += 1
        
        newsUrl = a.attr("href")
        title = a.text()
        if not newsUrl:
            break
        newsUrl = newsUrl[:newsUrl.rfind("&")]
        newsUrl = "http://www.shenzhenair.com/" + newsUrl   
        title = match.search(title).group()[len(r"</script>"):].strip()
        if checkNewsUrl(cursor, newsUrl):
            break        
        sourceFile = urllib.urlopen(newsUrl).read()
        newspq = PyQuery(sourceFile.decode("gbk"))
        content = newspq("table:eq(2)").text()
        #no publish date found on the page, so fall back to today
        postDate = time.strftime("%Y%m%d")
        source = ""
        begin = content.find("】")
        begin = content.find("】", begin + 1)
        end = content.rfind("【")
        if begin == -1:
            begin = 0
        content = content[begin + 2:end]
  
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        if i % 20 == 0:
            time.sleep(5)
        pass
    conn.commit()
    conn.close()
    pass

def chunqiuhangkong():
    """春秋航空
采用的是新闻页的,比首页的新闻还少"""
    airport = "春秋航空"
    newsPageUrl = "http://www.china-sss.com/Static/New_information"

    conn, cursor = mysqlDataBase_init()
    sourceFile = urllib.urlopen(newsPageUrl).read()
    pq = PyQuery(unicode(sourceFile, "utf-8"))
    i = 1
    while True:
        span = pq("div.New_information span:eq(%s)" % i)
        a = pq("div.New_information a:eq(%s)" % i)
        i += 1

        newsUrl = a.attr("href")
        if not newsUrl:
            break 
        title = a.text()
        postDate = span.text()
        newsUrl = "http://www.china-sss.com" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break            
        sourceFile = urllib.urlopen(newsUrl).read()
        newspq = PyQuery(unicode(sourceFile, "utf-8"))
        content = newspq("dl#content_top dd").text()
        source = ""
          
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass

def dongfanghangkong():
    """东方航空"""
    airport = "东方航空"
    newsPageUrl = "http://www.ce-air.com/mu/main/gydh/xwgg/index.html"
    index = 0
    while index != -1:  
        if index != 0:
            newsPageUrl = ("http://www.ce-air.com/"
                         "mu/main/gydh/xwgg/index_%s.html" % index)
             
        index += 1#next page
        pq = PyQuery(newsPageUrl)
        conn, cursor = mysqlDataBase_init()
        i = 0
        while True:
            a = pq("div.s_xw_list ul li:eq(%s)" % i).find("a")
            i += 1

            href = a.attr("href")
            if not href:
                break
            title = a.text()
            newsUrl = "http://www.ce-air.com/mu/main/gydh/xwgg/" + href[2:]    
            postDate = re.search(r"[\d]{8}", newsUrl).group()
            if postDate < earliestTime.replace("-", ""):
                index = -1#stop marker for the outer loop
                break
            if checkNewsUrl(cursor, newsUrl):
                index = -1#stop marker for the outer loop
                break
            newspq = PyQuery(newsUrl)
            content = newspq("div.body p:eq(1)").text()
            if content is None:
                content = newspq("div.body p:eq(0)").text()
            if len(content) == 0:
                content = newspq("div.body").text()
                content = content[content.rfind("}") + 1:]
                
            table = newspq("div.body table").text()           
            source = re.search("'.*'", table).group()[1:-1]
            
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()
        pass
    pass

def shanghaihangkong():
    """上海航空
没有新闻,只有快讯"""
    airport = "上海航空"
    newsPageUrl = "http://www.shanghai-air.com/cmsweb/sale.aspx?pageIndex=0"
    sourceFile = urllib.urlopen(newsPageUrl).read()
    pq = PyQuery(unicode(sourceFile, "utf-8"))
    
    conn, cursor = mysqlDataBase_init()
    i = 0
    while True:
        a = pq("div.items li:eq(%s)" % i).find("a")
        i += 1

        newsUrl = a.attr("href")
        title = a.attr("title")
        if not newsUrl:
            break
        newsUrl = "http://www.shanghai-air.com/cmsweb/" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break            
        sourceFile = urllib.urlopen(newsUrl).read()
        newspq = PyQuery(unicode(sourceFile, "utf-8"))
        content = newspq("div.nowtext").text()
        if not content:
            content = ""
        postDate = time.strftime("%Y%m%d")
        source=""
        
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass

def shanghaihangkong_old():
    """Shanghai Airlines news from the old site"""
    airport = "上海航空"
    
    newsPageUrl = "http://ww1.shanghai-air.com/news/saldt.asp?ttt="
    matchPostDate = re.compile(r"[\d]+-[\d]+-[\d]+")
    
    conn, cursor = mysqlDataBase_init()
    pq = PyQuery(newsPageUrl)
    i = 0
    while True:
        a = pq("a:eq(%s)" % i)
        i += 1
        newsUrl = a.attr("href")
        if not newsUrl:
            break
        if newsUrl.find("PublicInfo") == -1:
            continue
        newsUrl = "http://ww1.shanghai-air.com" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break
        title = a.text()[1:]
        postDate = parseDate(matchPostDate.search(a.text()).group())
        title = title[:-(len(postDate) + 2)]
        newspq = PyQuery(newsUrl)
        content = newspq("div.texttext").text()
        source = ""
        if not content:
            continue
        
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass

def zhongguohangkong():
    """Air China
    Occasionally raises a urlopen error, so page fetches are retried"""
    airport = "中国航空"
    newsPageUrl = "http://ffp.airchina.com.cn/cms/ffp/jszx/xwxx/default.html"
    
    conn, cursor = mysqlDataBase_init()
    count=0
    while count!=10:
        try:    
            pq = PyQuery(newsPageUrl)
        except urllib2.URLError:
            count+=1
            time.sleep(5)
        else:
            break   
    i = 0
    while True:
        newsUrl = pq("a:eq(%s)" % i).attr("href")   
        if not newsUrl:            
            break
        span = pq("span:eq(%s)" % i).text()
        i += 1
        postDate = re.search(r"[\d]{4}-[\d]{2}-[\d]{2}", span).group()
        if postDate < earliestTime:
            break
        title = span[:-14]
        
        newsUrl = "http://ffp.airchina.com.cn" + newsUrl   
        if checkNewsUrl(cursor, newsUrl):
            break   
        count=0   
        while count!=10:
            try:    
                newspq = PyQuery(newsUrl)
            except urllib2.URLError:
                count+=1
                time.sleep(5)
            else:
                break          
        content = newspq("body").text()
        source = ""   
             
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)        
        pass
    conn.commit()
    conn.close()
    pass

def sichuanhangkong():
    """四川航空"""
    airport = "四川航空"
    newsPageUrl = "http://www.scal.com.cn/ScalB2CWeb/News/More_News.aspx?code=NC00391"
    
    pq = PyQuery(urllib.urlopen(newsPageUrl).read())
    ul = pq("ul.news_n")
    
    conn, cursor = mysqlDataBase_init()
    match = re.compile(r"[\d]{4}")
    i = 0
    while True:
        a = pq("ul.news_n a:eq(%s)" % i)
        i += 1
        id = a.attr("onclick")
        if not id:
            break
        
        id = match.search(id).group()
        newsUrl = "http://www.scal.com.cn/Scal.WebMaster/FileUpLoad/htmlpage/%s.html"
        newsUrl = newsUrl % id  
        if checkNewsUrl(cursor, newsUrl):
            break           
        newspq = PyQuery(newsUrl)
        title = newspq("span.NewsTitle").text()
        postDate = newspq("span.NewsInfo_Time").text()
        postDate = parseDate(postDate)[5:]
        source = newspq("span.NewsInfo_Publisher").text()
        content = newspq("div.NewsContent").text()
        
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass
    
def tianjinhangkong():
    """天津航空"""
    airport = "天津航空"
    
    #天津航空官网上的的新闻
    url = "http://www.tianjin-air.com/index/ann!doAjaxQuery.action"
    pq = PyQuery(urllib.urlopen(url).read())
    match = re.compile("'.*'")
    conn, cursor = mysqlDataBase_init()
    i = 0
    while True:
        a = pq("a:eq(%s)" % i)
        i += 1
        title = a.attr("title")
        newsUrl = a.attr("href")
        if not newsUrl:
            break
        
        newsUrl = match.search(newsUrl).group()[1:-1]
        newsUrl = "http://www.tianjin-air.com" + newsUrl
        if checkNewsUrl(cursor, newsUrl):
            break             
        content = urllib.urlopen(newsUrl).read()
        begin = content.find("Date:")
        postDate = content[begin:content.find("\n", begin)][6:-6].strip()
        try:
            postDate = time.strftime("%Y-%m-%d", time.strptime(postDate, "%a, %d %b %Y %H:%M:%S"))
        except:
            matchp=re.search(r"[\d]+-[\d]+-[\d]+", postDate)
            if matchp:
                postDate=matchp.group()
            else:
                postDate=time.strftime("%Y%m%d")
        
        content = content[content.find("\n", content.find("X-MimeOLE")):].strip() 
        #the body is base64-encoded; decode it
        content = base64.decodestring(content)  
        try:
            newspq = PyQuery(content)
        except:
            continue
        content = newspq("body").text()
        if not content:
            continue
        source=""
        
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass
    conn.commit()
    conn.close()
    pass

def shandonghangkong():
    """Shandong Airlines"""
    airport = "山东航空"
    
    headerString = """Accept:*/*
Accept-Charset:GBK,utf-8;q=0.7,*;q=0.3
Accept-Encoding:gzip,deflate,sdch
Accept-Language:zh-CN,zh;q=0.8
Connection:keep-alive
Content-Length:176
Content-Type:application/x-www-form-urlencoded
Cookie:JSESSIONID=0000Dva3t-EvL-J6jQ5uEa8YppU:-1; GUEST_LANGUAGE_ID=zh_CN; COOKIE_SUPPORT=true; __ozlvd671=1302764071; __ozlvd=1302764071
Host:www.shandongair.com
Method:POST /c/portal/render_portlet HTTP/1.1
Origin:http://www.shandongair.com
Referer:http://www.shandongair.com/web/shair_zh/news
User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16"""
    headerString = headerString.split("\n")

    dataString = """p_l_id:89978
p_p_id:journal_news_INSTANCE_1h76
p_p_action:0
p_p_state:normal
p_p_mode:view
p_p_col_id:column-3
p_p_col_pos:2
p_p_col_count:3
currentURL:/web/shair_zh/news"""  
    dataString = dataString.split("\n")

    dataDict = {}
    for line in dataString:
        line = line.split(":", 1)
        dataDict[line[0]] = line[1]
    
    
    url = "http://www.shandongair.com/c/portal/render_portlet"
    request = urllib2.Request(url)
    for line in headerString:
        #split only on the first colon so URLs and cookies in header values survive
        line = line.strip().split(":", 1)
        request.add_header(line[0], line[1])
        
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) 
    
    page = 1
    while page != -1:
        dataDict["cur"] = "%s" % page
        data = urllib.urlencode(dataDict)
        page += 1
        
        sourceFile = opener.open(request, data).read()    
        conn, cursor = mysqlDataBase_init()
        pq = PyQuery(unicode(sourceFile, "utf8"))
        i = 0
        while True:
            a = pq("table.taglib-search-iterator a:eq(%s)" % i)
            span = pq("table.taglib-search-iterator span:eq(%s)" % i)
            i += 1
            
            newsUrl = a.attr("href")
            if not newsUrl:
                break
            newsUrl = "http://www.shandongair.com" + newsUrl
            if checkNewsUrl(cursor, newsUrl):
                page = -1#stop paging
                break
            title = a.text()
            postDate = span.text()
            if postDate < earliestTime:
                page = -1#stop paging
                break
            sourceFile = urllib.urlopen(newsUrl).read()
            newspq = PyQuery(unicode(sourceFile, "utf8"))
            content = newspq("td.main").text()
            source = ""
            
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()        
        pass 


    #news items on the Shandong Airlines home page
    newsPageUrl = "http://www.shandongair.com/"
    conn, cursor = mysqlDataBase_init()
    pq = PyQuery(newsPageUrl)
    i = 0
    while True:
        a = pq("div.ind_news_box_center a:eq(%s)" % i)
        i += 1
        newsUrl = a.attr("href")
        if not newsUrl:
            break
        if checkNewsUrl(cursor, newsUrl):
            break 
        sourceFile = urllib.urlopen(newsUrl).read()
        sourceFile = unicode(sourceFile, "utf-8")
        newspq = PyQuery(sourceFile)
        content = newspq("body table:eq(1)").text()[:-6]
        title = newspq("title").text()
        
        m = re.search(r"[\d]{4}年[\d]+月[\d]+日", content)
        if m:
            postDate = m.group()
            postDate = parseDate(postDate)
        else:
            postDate = time.strftime("%Y%m%d")
        source = ""
    
        executeSql(cursor, title, postDate, content, airport, source, newsUrl)
        pass 
    conn.commit()
    conn.close()
    pass

def jixianghangkong():
    """吉祥航空
没有看见新闻,就找了促销信息"""

    airport = "吉祥航空" 
    page = 1
    while page != -1:
        newsPageUrl = "http://www.juneyaoair.com/JY_Airlines_dtlist.jsp?page=%s&menu_no=203" % page
        page += 1
        
        try:
            pq = PyQuery(newsPageUrl) 
        except:
            break
        conn, cursor = mysqlDataBase_init()
        table = pq("table:eq(18)")
        trindex = 1
        while True:
            tr = table("tr:eq(%s)" % trindex)
            trindex += 1
            if not tr:
                break          
            a = tr("td a")
            newsUrl = "http://www.juneyaoair.com/" + a.attr("href")
            
            title = a.text()
            source = tr("td:eq(3)").text()
            postDate = tr("td:eq(4)").text()
            if not postDate:
                break
            if postDate < earliestTime:
                page = -1#stop paging
                break
            if checkNewsUrl(cursor, newsUrl):
                page = -1#stop paging
                break
            newspq = PyQuery(newsUrl)
            content = newspq("table:eq(6)").text()

            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        conn.commit()
        conn.close()
        pass
    pass

def nanfanghangkong():
    """南方航空
还有一定的问题"""
    airport = "南方航空"
    
    year = int(time.strftime("%Y"))
    match = re.compile(r"[\d]{4}-[\d]{2}-[\d]{2}")
    
    conn, cursor = mysqlDataBase_init()
    while str(year) >= earliestTime[:4]:
        newsBasePage = "http://www.csair.com/cn/aboutcsn/04/newsCompany/%s/list_news_%s.asp" % (year, year)
        sourceFile = urllib.urlopen(newsBasePage).read()
        pq = PyQuery(unicode(sourceFile, "utf8"))
        liindex = 0
        while True:
            li = pq("li:eq(%s)" % liindex)
            liindex += 1
            a = li("a")
            newsUrl = a.attr("href")
            if not newsUrl:
                break
            newsUrl = "http://www.csair.com" + newsUrl    
            postDate = match.search(li.text()).group()
            if postDate < earliestTime:
                return
            if checkNewsUrl(cursor, newsUrl):
                return             
            newspq = PyQuery(newsUrl) 
            content = newspq("#content").text()       
            title = content.split()[2]
            source = ""    
            if content.find("InstanceBeginEditable") != -1:
                content = content[len("InstanceBeginEditable name=\"con\" "):]
            if content.find("InstanceEndEditable") != -1:
                content = content[:-len("InstanceEndEditable")]

            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass
        year -= 1
        pass
    conn.commit()
    conn.close()
    pass

def chunqiuhangkongwang():
    """春秋航空网"""
    airport = "春秋航空"
    source = "春秋航空网"
    newsBasePageUrl = "http://info.china-sss.com/article_ps.do?xsl=topic_first_ilist_e&pt=1&ps=30&pi=%s&sf=seq_num&sfdt=number&st=ascending&cid=8069761218&bid=1&qtopic=3&qtit=&qct1=&qct2=#"

    conn, cursor = mysqlDataBase_init()
    sourceMatch = re.compile(r"来源:.*\[")
    page = 1
    while page != -1:
        newsPageUrl = newsBasePageUrl % page       
        page += 1
        
        sourceFile = urllib.urlopen(newsPageUrl).read() 
        pq = PyQuery(unicode(sourceFile, "gbk"))
        i = 0
        while True:
            a = pq("a:eq(%s)" % i)
            tr = pq("td#container1 tr:eq(%s)" % i)
            i += 1
            
            newsUrl = a.attr("href")
            if not newsUrl:
                break
            if len(newsUrl) == 1:
                continue    
            title = a.text()       
            newsUrl = "http://info.china-sss.com" + newsUrl
            postDate = "20" + tr("td:eq(2)").text()
            if postDate < earliestTime:
                page = -1
                break    
            if checkNewsUrl(cursor, newsUrl):
                page = -1
                break  
            sourceFile = urllib.urlopen(newsUrl).read()
            newspq = PyQuery(unicode(sourceFile, "gbk"))
            content = newspq("td.fontmain1").text()
            if content.find("function showContent()") != -1:
                content = content[:content.find("function showContent()")]
            
            sourceInContent = sourceMatch.search(content)
            if sourceInContent:
                source = sourceInContent.group()[len("来源:"):-1]
            if source.find("】") != -1:
                source = source[:-len("】")]
            content = content.replace("[/链接]", "")
            if content.find("[链接 地址=") != -1:
                content = content[:-content.find("[链接 地址=")]        
            
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            #restore the default source
            source = "春秋航空网"
            pass 
        pass
    conn.commit()
    conn.close()
    pass


def minhangziyuanwang():
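    """news.carnoc.com (CARNOC / 民航资源网) search results for each airline"""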
    source = "民航资源网"
    matchPostDate = re.compile(r"[\d]+-[\d]+-[\d]+")
    conn, cursor = mysqlDataBase_init()
    
    au = ["天津航空", "吉祥航空", "厦门航空", "四川航空", "中国航空"]

    for airport in au:
        url = "http://news.carnoc.com/search.jsp?key=%s&querytype=0&page=1"    
        url = url % (urllib.quote(airport.decode("utf8").encode("gb2312")))

        sourceFile = urllib.urlopen(url).read()
        pq = PyQuery(sourceFile)
        index = 0
        while True:
            li = pq("div.text li:eq(%s)" % index)
            index += 1
            if not li:
                break
            newsUrl = li("a").attr("href")

            title = li("a").text()
            match = matchPostDate.search(li.text())
            if not match:
                continue
            postDate = match.group()
            if newsUrl.find("2000") != -1:
                continue
            if checkNewsUrl(cursor, newsUrl):
                break
            
            sourceFile = urllib.urlopen(newsUrl).read()
            newspq = PyQuery(sourceFile)
            content = newspq("div#newstext").text()
            
            if not content:
                continue
            if len(content) < 100:
                continue
            executeSql(cursor, title, postDate, content, airport, source, newsUrl)
            pass 
    
    #restore the default
    source = ""
    conn.commit()
    conn.close()
    pass

def crawl_news():
    startTime = time.time()
    #used by the final version: import this module itself to enumerate the crawlers
    import jipiao
    threads = []
    #build the thread list
    threads.append(threading.Thread(target=minhangziyuanwang))
    for f in dir(jipiao):
        if f.find("hangkong") != -1:
            threads.append(threading.Thread(target=getattr(jipiao, f)))
    #start all threads
    for t in threads:
        t.start()
    #wait for every thread to finish
    for t in threads:
        t.join()
        
    printf((time.time() - startTime))    
    pass

if __name__ == "__main__":
    startTime = time.time()
    crawl_news()
    
    #write a log entry
    if not os.path.exists("logs"):
        os.mkdir("logs")
    logs = open("./logs/logs.txt", "a")
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    logs.write("%s\telapsed: %ss\n" % (now, time.time() - startTime))
    logs.close()
    
    printf(time.time() - startTime)

#    chongqinghangkong()
#    chunqiuhangkong()
#    chunqiuhangkongwang()#Spring Airlines news portal
#    dongfanghangkong()
#    hannanhangkong()
#    jixianghangkong()
#    nanfanghangkong()
#    shandonghangkong()#the ajax listing is awkward, so it is fetched via POST
#    shanghaihangkong()
#    shanghaihangkong_old()#old Shanghai Airlines site
#    shenzhenhangkong()
#    sichuanhangkong()
#    tianjinhangkong()
#    xiamenhangkong()
#    xingfuhangkong()
#    zhongguohangkong()
#    minhangziyuanwang()

SQL to create the table
CREATE TABLE IF NOT EXISTS `jipiaonews` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` text NOT NULL,
  `postDate` datetime NOT NULL,
  `content` text NOT NULL,
  `airport` varchar(255) NOT NULL,
  `source` varchar(255) NOT NULL,
  `newsUrl` varchar(255) NOT NULL DEFAULT '',
  PRIMARY KEY (`id`)
) ENGINE=MyISAM  DEFAULT CHARSET=utf8;
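
executeSql above strips single and double quotes and splices every value straight into the SQL string. MySQLdb can instead receive the values separately and escape them itself, which keeps quotes inside the article text. Below is a minimal sketch against the jipiaonews table above; executeSqlParam is a hypothetical name and the connection settings are assumptions that should match mysqlDataBase_init().

#coding=utf-8
#Sketch of a placeholder-based insert; not part of the original script.
import MySQLdb

def executeSqlParam(cursor, title, postDate, content, airport, source, newsUrl):
    """Insert one row; MySQLdb escapes each value, so quotes need not be stripped"""
    insertsql = ("insert into jipiaonews (title,postDate,content,airport,source,newsUrl)"
                 " values(%s,%s,%s,%s,%s,%s)")
    cursor.execute(insertsql, (title, postDate, content, airport, source, newsUrl))

if __name__ == "__main__":
    #assumed connection settings; reuse mysqlDataBase_init() in the real script
    conn = MySQLdb.connect(host="localhost", user="root", passwd="", db="jipiao", charset="utf8")
    cursor = conn.cursor()
    executeSqlParam(cursor, "测试标题", "2011-03-15 00:00:00", "正文 'with quotes'",
                    "厦门航空", "", "http://example.com/news/1")
    conn.commit()
    conn.close()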
