将一个网络链接保存为mht格式的文件

import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse

class MHTHTMLParser(HTMLParser):
    """Collect the URLs of external scripts and stylesheets of an HTML page.

    Feed the page source with ``feed()``; afterwards ``self.urls`` holds the
    collected ``src``/``href`` values in document order.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # URLs of the resources that should be embedded next to the page.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource referenced by a ``<link>`` or ``<script>`` tag.

        :param tag: lower-cased tag name supplied by ``HTMLParser``.
        :param attrs: list of ``(name, value)`` attribute pairs.
        """
        if tag not in ('link', 'script'):
            return
        attrs = dict(attrs)
        if 'src' in attrs:
            src = attrs.get('src')
            # Sources whose URL mentions "google" are deliberately left out.
            if src and src.find('google') == -1:
                self.urls.append(src)
        elif 'stylesheet' in attrs.values():
            href = attrs.get('href')
            # A stylesheet tag without a usable href has nothing to download;
            # recording None here would crash the downloader later on.
            if href:
                self.urls.append(href)

class URL2MHT(object):
    """Download a web page with its scripts/stylesheets into one MHT message."""

    def __init__(self, url):
        """Remember ``url`` and derive the site root used for relative paths.

        :param url: absolute URL of the page to archive.
        """
        parts = urlparse(url)
        # Build the root from scheme and host instead of splitting the URL on
        # its path: an empty path makes str.split() raise ValueError, a path
        # of "/" cuts the URL down to "http:", and a path whose text also
        # occurs in the host name splits at the wrong place.
        self.domain = parts.scheme + "://" + parts.netloc
        self.url = url

    def _head(self):
        """Return the empty ``multipart/related`` container message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type",
                     "multipart/related",
                     type="text/html",
                     boundary="----=_Part_7C84B8F2_5B84C39F.150DBE9AC97")
        return a

    def mht(self):
        """Fetch the page and every resource it references.

        :returns: the container message with the page as first part followed
            by one part per URL found by ``MHTHTMLParser``.
        """
        page = urllib2.urlopen(self.url)
        try:
            content = page.read()
        finally:
            page.close()

        pmht = MHTHTMLParser()
        pmht.feed(content)
        pmht.close()

        head = self._head()
        head.attach(self._add(self.url))

        for url in pmht.urls:
            # Tags without a usable URL give nothing to download.
            if url:
                head.attach(self._add(url))
        return head

    def _add(self, url):
        """Download ``url`` and wrap it into a message part.

        The URL is first tried as given; when it cannot be opened it is
        retried with ``self.domain`` prepended (site-relative path).

        :param url: absolute URL or site-relative path of the resource.
        :returns: ``email.message.Message`` holding the encoded resource.
        :raises urllib2.URLError: when the retry fails as well.
        """
        m = email.message.Message()
        try:
            response = urllib2.urlopen(url)
            local_url = url
        except (ValueError, urllib2.URLError):
            # ValueError: urllib2 rejects URLs without a scheme.
            local_url = self.domain + url
            response = urllib2.urlopen(local_url)
        try:
            content_type = response.headers.dict.get('content-type')
            content = response.read()
        finally:
            response.close()

        if content_type and content_type.startswith("text/"):
            m["Content-Transfer-Encoding"] = "quoted-printable"
            m.set_payload(quopri.encodestring(content).decode("ascii"))
        else:
            m["Content-Transfer-Encoding"] = "base64"
            m.set_payload(base64.b64encode(content).decode("ascii"))
        m["Content-Location"] = local_url
        # Servers may omit the header; never store None as a header value.
        m["Content-Type"] = content_type or "application/octet-stream"
        return m

# Archive a sample page and dump the resulting MHT message to stdout.
url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
print(URL2MHT(url).mht())



# encoding=utf-8

import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urljoin
from urlparse import urlparse

import chardet

# Python 2-only workaround: switch the interpreter-wide default codec used for
# implicit str <-> unicode conversions to UTF-8.
# NOTE(review): reload(sys) presumably is needed because setdefaultencoding is
# not available on the sys module after start-up - confirm.
reload(sys)
sys.setdefaultencoding('utf-8')


class MHTHTMLParser(HTMLParser):
    """Collect ``(url, mime_type)`` pairs for resources named by ``<link>`` tags.

    Feed the page source with ``feed()``; afterwards ``self.urls`` holds the
    collected pairs in document order.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # (url, mime type) of every resource to embed next to the page.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource referenced by a ``<link>`` tag.

        :param tag: lower-cased tag name supplied by ``HTMLParser``.
        :param attrs: list of ``(name, value)`` attribute pairs.
        """
        if tag != 'link':  # 'script' tags are currently not collected
            return
        attrs = dict(attrs)
        src = attrs.get('src')
        # Sources whose URL mentions "google" are deliberately left out.
        if src and src.find('google') == -1:
            # "or" also covers a type attribute that is present but empty.
            self.urls.append((src, attrs.get('type') or 'text/javascript'))
        elif attrs.get('rel') == 'stylesheet':
            href = attrs.get('href')
            # A stylesheet link without a usable href has nothing to
            # download; recording None would crash the downloader later on.
            if href:
                self.urls.append((href, attrs.get('type') or 'text/css'))


class URL2MHT(object):
    """Download a web page with its linked resources into one MHT message."""

    def __init__(self, url):
        """Remember ``url``, the site root and the request headers to send.

        :param url: absolute URL of the page to archive.
        """
        uparse = urlparse(url)
        self.domain = uparse.scheme + "://" + uparse.netloc
        self.url = url
        # Sent with every request so the fetch carries a browser User-Agent.
        self.header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

    def _head(self):
        """Return the empty ``multipart/related`` container message."""
        a = email.message.Message()
        a["MIME-Version"] = "1.0"
        a["X-UnMHT-Save-State"] = "Current-State"
        a.add_header("Content-Type",
                     "multipart/related",
                     type="text/html")
        return a

    def _fetch(self, url):
        """Return the raw body of ``url``, requested with ``self.header``."""
        response = urllib2.urlopen(urllib2.Request(url, None, self.header))
        try:
            return response.read()
        finally:
            response.close()

    def _resolve(self, url):
        """Turn a reference found in the page into an absolute URL.

        Fully qualified URLs are returned untouched; root-relative,
        document-relative and protocol-relative (``//host/path``) references
        are resolved against the URL of the archived page.
        """
        parts = urlparse(url)
        if parts.scheme and parts.netloc:
            return url
        return urljoin(self.url, url)

    def mht(self):
        """Fetch the page and every resource it references.

        :returns: the container message with the page as first part followed
            by one part per entry found by ``MHTHTMLParser``.
        """
        content = self._fetch(self.url)

        pmht = MHTHTMLParser()
        pmht.feed(content)
        pmht.close()

        head = self._head()
        head.attach(self._add(self.url, utype='text/html'))

        for url, utype in pmht.urls:
            # Tags without a usable URL give nothing to download.
            if url:
                head.attach(self._add(url, utype))
        return head

    def _add(self, url, utype=None):
        """Download ``url`` and wrap it into a message part.

        :param url: absolute or relative URL of the resource.
        :param utype: MIME type to declare; ``text/*`` types are stored
            quoted-printable, everything else base64.
        :returns: ``email.message.Message`` holding the encoded resource.
        """
        m = email.message.Message()
        local_url = self._resolve(url)
        content = self._fetch(local_url)

        if utype and utype.startswith("text/"):
            ecd = "quoted-printable"
            ctn = quopri.encodestring(content)
        else:
            ecd = "base64"
            ctn = base64.b64encode(content)

        m["Content-Transfer-Encoding"] = ecd
        m["Content-Location"] = local_url
        # Never store None as a header value when no type was supplied.
        m["Content-Type"] = utype or "application/octet-stream"
        m.set_payload(ctn)
        return m


# url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
url = 'http://blog.csdn.net/zhaoyl03/article/details/8631645'
# The disabled lines below fetch `url` and save the archive as hello.mht; the
# active code after them only inspects an already existing hello.mht.
# a = URL2MHT(url).mht().as_string(unixfrom=False)
# print a
# import codecs
# fh = codecs.open("hello.mht", mode="wb", encoding="utf-8")
# fh.write(a)
# fh.close()

# Read the saved archive; the with-block closes the file handle explicitly.
with open('hello.mht') as fh:
    x = fh.read()
print(type(x))
print(chardet.detect(x))
x = x.decode('utf-8')

print(type(x))
# NOTE(review): x is a decoded (unicode) object at this point; chardet may
# reject non-bytes input depending on its version - confirm.
print(chardet.detect(x))
百里求一的博客

观察，思考，学习

将一个网络链接保存为mht格式的文件