# 将一个网络链接保存为 MHT 格式的文件 (Save the web page behind a URL as an MHT file.)
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urljoin
from urlparse import urlparse
class MHTHTMLParser(HTMLParser):
    """Collect the URLs of external resources referenced by an HTML page.

    After the markup has been passed to ``feed()``, ``urls`` lists, in
    document order:

    * the ``src`` of every ``<link>``/``<script>`` tag that has one, unless
      the value is empty or contains the substring ``google``;
    * the ``href`` of every ``<link>``/``<script>`` tag without ``src`` that
      carries the value ``stylesheet`` in any of its attributes.

    URLs are stored exactly as written in the markup (they may be relative).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Resource URLs found so far, in document order.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource URL of a ``<link>`` or ``<script>`` start tag.

        Args:
            tag: lower-cased tag name supplied by ``HTMLParser``.
            attrs: list of ``(name, value)`` attribute pairs.
        """
        if tag not in ('link', 'script'):
            return
        attrs = dict(attrs)
        if 'src' in attrs:
            src = attrs['src']
            # Ignore empty sources and sources containing "google".
            if src and 'google' not in src:
                self.urls.append(src)
        elif 'stylesheet' in attrs.values():
            href = attrs.get('href')
            # A stylesheet tag without a usable href has nothing to download;
            # storing None here would make the downloader fail later on.
            if href:
                self.urls.append(href)
class URL2MHT(object):
    """Download a web page and the resources it links into one MHT message.

    ``mht()`` returns an ``email.message.Message`` of type
    ``multipart/related`` whose first part is the page itself, followed by
    one part for every URL reported by ``MHTHTMLParser``.
    """

    def __init__(self, url):
        """Remember the page URL and derive its ``scheme://host`` prefix.

        Args:
            url: absolute URL of the page to archive.
        """
        parts = urlparse(url)
        # Build the origin from the parsed pieces: splitting the URL on its
        # path raises ValueError for an empty path and yields "http:" when
        # the path is just "/".
        self.domain = parts.scheme + "://" + parts.netloc
        self.url = url

    def _head(self):
        """Return the empty ``multipart/related`` container message."""
        head = email.message.Message()
        head["MIME-Version"] = "1.0"
        head["X-UnMHT-Save-State"] = "Current-State"
        head.add_header("Content-Type",
                        "multipart/related",
                        type="text/html",
                        boundary="----=_Part_7C84B8F2_5B84C39F.150DBE9AC97")
        return head

    def mht(self):
        """Fetch the page and its resources and return the MHT message.

        Returns:
            The multipart ``email.message.Message`` holding every part.

        Raises:
            urllib2.URLError, ValueError: if a download fails or a URL
                cannot be opened.
        """
        page = self._fetch(self.url)
        pmht = MHTHTMLParser()
        pmht.feed(page[2])
        pmht.close()
        head = self._head()
        # Reuse the response that was just parsed instead of downloading the
        # page a second time.
        head.attach(self._part(*page))
        for url in pmht.urls:
            # Skip empty/None entries; there is nothing to download for them.
            if url:
                head.attach(self._add(url))
        return head

    def _add(self, url):
        """Download ``url`` and return it as a MIME part.

        Args:
            url: absolute or relative URL of the resource.

        Returns:
            An ``email.message.Message`` with the encoded resource body.
        """
        return self._part(*self._fetch(url))

    def _resolve(self, url):
        """Return ``url`` as an absolute URL, relative to the page URL."""
        parts = urlparse(url)
        if parts.scheme and parts.netloc:
            return url
        # Handles "/rooted", "page-relative" and "//host/..." references.
        return urljoin(self.url, url)

    def _fetch(self, url):
        """Download ``url``; return ``(location, content_type, body)``."""
        location = self._resolve(url)
        response = urllib2.urlopen(location)
        try:
            content_type = response.headers.dict.get('content-type')
            body = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
        return location, content_type, body

    def _part(self, location, content_type, body):
        """Wrap downloaded bytes in a MIME part.

        ``text/*`` bodies are quoted-printable encoded, everything else is
        base64 encoded.
        """
        part = email.message.Message()
        if content_type and content_type.startswith("text/"):
            part["Content-Transfer-Encoding"] = "quoted-printable"
            part.set_payload(quopri.encodestring(body).decode("ascii"))
        else:
            part["Content-Transfer-Encoding"] = "base64"
            part.set_payload(base64.b64encode(body).decode("ascii"))
        part["Content-Location"] = location
        # A response without a Content-Type header must not produce a header
        # whose value is None.
        part["Content-Type"] = content_type or "application/octet-stream"
        return part
# Demo run: archive one blog post and print the resulting MHT message.
url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
print(URL2MHT(url).mht())
# encoding=utf-8
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse
import chardet
# Python 2 only: reload the sys module so that setdefaultencoding, which is
# removed from the module's namespace at interpreter start-up, is available
# again, then make UTF-8 the codec used for implicit str <-> unicode
# conversions in this process. The two statements must stay in this order.
reload(sys)
sys.setdefaultencoding('utf-8')
class MHTHTMLParser(HTMLParser):
    """Collect ``(url, mime_type)`` pairs for resources linked from a page.

    Only ``<link>`` tags are inspected. After the markup has been passed to
    ``feed()``, ``urls`` lists, in document order:

    * ``(src, type)`` for a tag with a non-empty ``src`` that does not
      contain the substring ``google`` (type defaults to
      ``text/javascript``);
    * otherwise ``(href, type)`` for a tag with ``rel="stylesheet"`` and a
      non-empty ``href`` (type defaults to ``text/css``).

    URLs are stored exactly as written in the markup (they may be relative).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # (url, mime_type) tuples found so far, in document order.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the resource referenced by a ``<link>`` start tag.

        Args:
            tag: lower-cased tag name supplied by ``HTMLParser``.
            attrs: list of ``(name, value)`` attribute pairs.
        """
        if tag != 'link':  # 'script' tags are currently not collected
            return
        attrs = dict(attrs)
        src = attrs.get('src')
        if src and 'google' not in src:
            # "or" also covers a type attribute that is present but empty.
            self.urls.append((src, attrs.get('type') or 'text/javascript'))
        elif attrs.get('rel') == 'stylesheet':
            href = attrs.get('href')
            # A stylesheet link without a usable href has nothing to
            # download; storing None would make the downloader fail later.
            if href:
                self.urls.append((href, attrs.get('type') or 'text/css'))
class URL2MHT(object):
    """Download a web page and its linked resources into one MHT message.

    ``mht()`` returns an ``email.message.Message`` of type
    ``multipart/related`` whose first part is the page itself, followed by
    one part for every ``(url, mime_type)`` pair reported by
    ``MHTHTMLParser``. Every request is sent with the headers in
    ``self.header``.
    """

    def __init__(self, url):
        """Remember the page URL and derive its ``scheme://host`` prefix.

        Args:
            url: absolute URL of the page to archive.
        """
        parts = urlparse(url)
        self.domain = parts.scheme + "://" + parts.netloc
        self.url = url
        # Request headers sent with every download.
        self.header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}

    def _head(self):
        """Return the empty ``multipart/related`` container message."""
        head = email.message.Message()
        head["MIME-Version"] = "1.0"
        head["X-UnMHT-Save-State"] = "Current-State"
        head.add_header("Content-Type",
                        "multipart/related",
                        type="text/html")
        return head

    def mht(self):
        """Fetch the page and its resources and return the MHT message.

        Returns:
            The multipart ``email.message.Message`` holding every part.

        Raises:
            urllib2.URLError, ValueError: if a download fails or a URL
                cannot be opened.
        """
        content = self._fetch(self.url)
        pmht = MHTHTMLParser()
        pmht.feed(content)
        pmht.close()
        head = self._head()
        # Hand over the bytes that were just parsed so the page is not
        # downloaded a second time.
        head.attach(self._add(self.url, utype='text/html', content=content))
        for url, utype in pmht.urls:
            # Skip empty/None entries; there is nothing to download for them.
            if url:
                head.attach(self._add(url, utype))
        return head

    def _add(self, url, utype=None, content=None):
        """Return the resource at ``url`` as a MIME part.

        Args:
            url: absolute or relative URL of the resource.
            utype: MIME type to declare for the part; ``text/*`` bodies are
                quoted-printable encoded, everything else base64 encoded.
            content: already-downloaded body; when ``None`` the resource is
                fetched from the resolved URL.

        Returns:
            An ``email.message.Message`` with the encoded resource body.
        """
        local_url = self._resolve(url)
        if content is None:
            content = self._fetch(local_url)
        if utype and utype.startswith("text/"):
            encoding = "quoted-printable"
            payload = quopri.encodestring(content)
        else:
            encoding = "base64"
            payload = base64.b64encode(content)
        part = email.message.Message()
        part["Content-Transfer-Encoding"] = encoding
        part["Content-Location"] = local_url
        # An unknown type must not produce a header whose value is None.
        part["Content-Type"] = utype or "application/octet-stream"
        part.set_payload(payload)
        return part

    def _resolve(self, url):
        """Return ``url`` as an absolute URL, relative to the page URL."""
        parts = urlparse(url)
        if parts.scheme and parts.netloc:
            return url
        # Handles "/rooted", "page-relative" and "//host/..." references.
        return urljoin(self.url, url)

    def _fetch(self, url):
        """Download ``url`` with ``self.header`` and return the raw body."""
        response = urllib2.urlopen(urllib2.Request(url, None, self.header))
        try:
            return response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
# url = 'http://www.cnblogs.com/weixliu/p/3554868.html'
url = 'http://blog.csdn.net/zhaoyl03/article/details/8631645'

# Disabled step: download `url` and write the archive to hello.mht.
# a = URL2MHT(url).mht().as_string(unixfrom=False)
# print a
# import codecs
# fh = codecs.open("hello.mht", mode="wb", encoding="utf-8")
# fh.write(a)
# fh.close()

# Encoding check on hello.mht: report the type and the chardet guess for the
# raw file contents, then again after decoding them as UTF-8.
with open('hello.mht') as fh:  # "with" closes the file once it has been read
    x = fh.read()
print(type(x))
print(chardet.detect(x))
x = x.decode('utf-8')
print(type(x))
# NOTE(review): x is a decoded (unicode) object here; confirm that the
# installed chardet version accepts such input.
print(chardet.detect(x))