python 正则表达式匹配中文utf8 |
|
python 正则表达式匹配中文utf8 |
>>> import re
>>> x = u'中国'.encode('utf8')
>>> s = x.__repr__()
>>> s
"'\\xc3\\x96\\xc3\\x90\\xc2\\xb9\\xc3\\xba'"
>>> pattern = '.*%s.*' % s[1:-1]
>>> pattern
'.*\\xc3\\x96\\xc3\\x90\\xc2\\xb9\\xc3\\xba.*'
>>> r = re.match(pattern,u'我是中国人'.encode('utf8'))
>>> r
<_sre.SRE_Match object at 0x012A52C0>
>>> dir(r)
['__copy__', '__deepcopy__', 'end', 'expand', 'group', 'groupdict', 'groups', 'span', 'start']
>>> r.group()
'\xc3\x8e\xc3\x92\xc3\x8a\xc3\x87\xc3\x96\xc3\x90\xc2\xb9\xc3\xba\xc3\x88\xc3\x8b'
>>>
|
新闻抓取爬虫 PPT |
|
新闻抓取爬虫 PPT |
# -*- coding: utf-8 -*-
import re
import sys,os
import re
chars = []  # global vocabulary: every distinct character seen, in first-seen order

def generate_chars(strList):
    """Build the global character vocabulary from a list of strings.

    Appends each character not already present to the module-level
    ``chars`` list, preserving first-seen order.  Mutates global state
    and returns None.
    """
    # FIX: `char not in chars` scanned the list on every character
    # (accidental O(n^2)); a shadow set gives O(1) membership while the
    # list keeps the original order and contents.
    seen = set(chars)
    for sr in strList:
        for char in sr:
            if char not in seen:
                seen.add(char)
                chars.append(char)
def vector(sr):
    """Map string *sr* onto the global ``chars`` vocabulary.

    Returns a 0/1 list the length of ``chars``: element i is 1 iff
    chars[i] occurs somewhere in *sr*.
    """
    # `ch in sr` is equivalent to the original `sr.find(ch) != -1`
    return [1 if ch in sr else 0 for ch in chars]
def intersaction(str1, str2):
    """Return how many vocabulary characters the two strings share.

    Computed as the dot product of the two 0/1 character vectors, i.e.
    the size of the character overlap restricted to the global ``chars``
    vocabulary.  (Name kept as in the original source, though the usual
    spelling is "intersection".)
    """
    v1 = vector(str1)
    v2 = vector(str2)
    # sum/zip replaces the Python-2-only `xrange` index loop
    return sum(a * b for a, b in zip(v1, v2))
def detect_repeat(strList):
    """Filter near-duplicate strings out of *strList*.

    A string is kept only if it shares fewer than ``critical``
    vocabulary characters with every string already kept; kept strings
    are returned in input order.
    """
    # FIX: the original wrote `chars = generate_chars(strList)`, binding
    # the function's None return to a useless local; generate_chars only
    # populates the module-level vocabulary as a side effect.
    generate_chars(strList)
    # threshold: sharing this many (or more) characters counts as a repeat
    critical = 3
    results = []
    for st in strList:
        if all(intersaction(st, rt) < critical for rt in results):
            results.append(st)
    # return the de-duplicated strings
    return results
if __name__ == "__main__":
    # demo: two near-duplicate pairs; detect_repeat keeps one of each
    strLists = [u"大家好啊", u"大家好", u"张靓颖新专辑", u"张靓颖专辑"]
    results = detect_repeat(strLists)
    # FIX: single-argument print() calls run under both Python 2 and 3;
    # the bare print statements were Python-2-only syntax.
    print("result " + " > " * 20)
    for ru in results:
        print(ru.encode("utf-8"))
|
Python保存二进制数据到sqlite3 |
|
Python保存二进制数据到sqlite3 |
# coding: utf8
# Python2.6.2
# Round-trips a binary file through an sqlite3 BLOB column:
# reads 0.bin, stores it, reads it back, writes it to 00.bin.
import sqlite3
db = sqlite3.connect('test.db')
try:
    cur = db.cursor()
    cur.execute("CREATE TABLE if not exists t (b BLOB);")
    # sqlite3.Binary wraps the raw bytes so they are bound as a BLOB,
    # not as TEXT
    with open('0.bin', 'rb') as f:
        cur.execute("insert into t values(?)", (sqlite3.Binary(f.read()),))
    db.commit()
    cur.execute('select b from t limit 1')
    b = cur.fetchone()[0]
    with open('00.bin', 'wb') as f:
        f.write(b)
finally:
    # FIX: close the connection even when a file or SQL operation raises;
    # the original leaked it on any error path
    db.close()
|
找email |
|
关于 Python 抓取网页的问题 |
# Match a whole line that looks like an email address.
rawstr = r'^[a-z0-9A-Z_\-]{1,}@[a-z0-9A-Z_\-]{1,}\.[a-z0-9A-Z_\-.]{1,}$'
# FIX: the original called re.compile(pattern), referencing `pattern`
# before it existed (NameError); the raw pattern string must be compiled.
pattern = re.compile(rawstr)
# `字符串` is the caller-supplied text to scan (placeholder name from the
# original snippet — substitute the real string variable).
emails = pattern.findall(字符串)
|
httplib抓网页 |
|
httplib抓取UTF8编码的网页,将内容解码时出错 |
# -*- coding: utf-8 -*-
# Fetch www.douban.com and dump the UTF-8 page body to a local log file.
import codecs
import httplib
import sys
conn = httplib.HTTPConnection('www.douban.com', 80)
try:
    conn.request('GET', '/')
    resp = conn.getresponse()
    # FIX: decode once at the I/O boundary; the 'replace' error handler
    # keeps a stray malformed byte in the response from aborting the whole
    # dump with UnicodeDecodeError (the failure the original snippet hit).
    text = resp.read().decode('utf8', 'replace')
    # codecs.open re-encodes the unicode text as UTF-8 on write
    f = codecs.open('C:\\tmp\\web.log', 'w', 'utf8')
    try:
        f.write(text)
    finally:
        f.close()
finally:
    # FIX: release the connection even if the request, decode or write raises
    conn.close()
|
美丽的汤 |
|
Beautiful Soup 抓取网页的问题 |
import urllib2
from BeautifulSoup import BeautifulSoup,Tag
import re
import os
import stat
def analysis(url_str):
    """Fetch *url_str* and return its <title> tag (page assumed GB2312)."""
    page = urllib2.urlopen(url_str)
    # parse with an explicit source encoding so Chinese pages decode correctly
    parsed = BeautifulSoup(page, fromEncoding="gb2312")
    title_tag = parsed.html.head.title
    return title_tag
# demo: print the <title> of the 163.com front page
url_str = "http://www.163.com"
str_string = analysis(url_str)
# FIX: single-argument print() runs under both Python 2 and 3; the bare
# print statement was Python-2-only syntax.
print(str_string)
|
抓中文网页乱码的解释 |
|
刚学python,抓中文网页遇到编码的问题,怎么转换也不行…… |
# -*- coding: utf-8 -*-
import urllib2, htmllib, formatter
class LinksExtractor(htmllib.HTMLParser):
    """Collect every href and its anchor text from an HTML document."""

    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        self.links = []      # href attribute values, in document order
        self.archtexts = []  # text chunks seen inside <a>...</a>
        self.in_anchor = 0   # truthy while the parser is inside an anchor

    def start_a(self, attrs):
        # entering an <a> tag: remember the state and record its href
        self.in_anchor = 1
        for name, value in attrs:
            if name == "href":
                self.links.append(value)

    def end_a(self):
        self.in_anchor = 0

    def handle_data(self, text):
        # only text between <a> and </a> is interesting
        if self.in_anchor:
            self.archtexts.append(text)

    def get_links(self):
        return self.links
# get html source
request = urllib2.Request('http://www.baidu.com/')
request.add_header('User-Agent', 'Mozilla/5.0')
opener = urllib2.build_opener()
htmlSource = opener.open(request).read()
# NullFormatter: parse only, produce no formatted output.
# FIX: renamed from `format`, which shadowed the builtin.
fmt = formatter.NullFormatter()
htmlparser = LinksExtractor(fmt)
htmlparser.feed(htmlSource)
htmlparser.close()
links = htmlparser.get_links()
# FIX: iterate links and anchor texts in lockstep instead of indexing via
# range(len(...)); zip also cannot raise IndexError if the lists differ
# in length.  Parenthesized print runs under both Python 2 and 3.
for url, text in zip(htmlparser.links, htmlparser.archtexts):
    print("url: %s, text: %s" % (url, text))
|
通过pyODBC 连接mssql 配置后无法连接 |
|
通过pyODBC 连接mssql 配置后无法连接 |
import urllib2
from BeautifulSoup import BeautifulSoup,Tag
import re
import os
import stat
#import MySQLdb
import pyodbc
# DSN-less connection string for the local SQL Server default instance
source = 'DRIVER={SQL Server};SERVER=.;DATABASE=test;UID=sa;PWD=123456'
db = pyodbc.connect(source)
cursor = db.cursor()
# FIX: T-SQL string literals use single quotes; double quotes denote
# identifiers under QUOTED_IDENTIFIER ON, so term="103" fails with
# "invalid column name '103'".  A bound parameter sidesteps the quoting
# problem (and SQL injection) entirely.
cursor.execute('select url,term from html_url where term=?', ('103',))
for row in cursor.fetchall():
    url_str = str(row[0])
    # parenthesized print runs under both Python 2 and 3
    print(url_str)
|