##################################################
#qq:316118740
#BLOG:http://hi.baidu.com/alalmn
# Purpose: fetch the link addresses found in a web page and report whether
# each one is an absolute HTTP(S) address.
##################################################
import re

try:
    import urllib2
except ImportError:
    # Python 3: the urllib2 functionality lives in urllib.request.
    import urllib.request as urllib2

# Anchor tags that carry an href attribute.  "." does not match a newline,
# so an anchor spread over several lines is not picked up.
_ANCHOR_RE = re.compile(r'<a.+?href=.+?>.+?</a>')
# Value of a double-quoted href attribute.
_HREF_RE = re.compile(r'(?<=href\=\").*?(?=\")')
# URL_DZ stops once more than this many anchors have been handled.
_MAX_LINKS = 1000
_HTTP_PREFIXES = ('http://', 'https://')


##################################################
def URL_STR(data):
    """Tell whether *data* lacks an absolute HTTP(S) prefix.

    Args:
        data: the address string to inspect.

    Returns:
        0 when *data* starts with 'http://' or 'https://', 1 otherwise.
        The "1 means not HTTP" convention is kept for existing callers.
    """
    return 0 if data.startswith(_HTTP_PREFIXES) else 1


##################################################
def URL_DZ(URL):
    """Print the double-quoted href of each anchor in the page at *URL*.

    Addresses for which URL_STR returns 1 are printed with a trailing
    note; the others are printed bare.  Anchors without a double-quoted
    href are skipped.

    Args:
        URL: address of the page to download.

    Raises:
        Whatever urlopen raises when the page cannot be fetched.
    """
    response = urllib2.urlopen(URL)
    try:
        page = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()

    if not isinstance(page, str):
        # Python 3 returns bytes.  The page charset is not inspected here;
        # UTF-8 with replacement is assumed so the str patterns can be used.
        page = page.decode('utf-8', 'replace')

    for index, anchor in enumerate(_ANCHOR_RE.findall(page)):
        if index > _MAX_LINKS:
            print("More than %d URL addresses found, stopping the loop\n"
                  % _MAX_LINKS)
            break

        match = _HREF_RE.search(anchor)
        if match is None:
            # href is unquoted or single-quoted: nothing to report.
            continue

        href = match.group(0)
        if URL_STR(href):
            print("%s %s" % (href, "not an absolute HTTP(S) address"))
        else:
            print(href)


##################################################
if __name__ == "__main__":
    URL_DZ("http://www.baidu.com")
# Tags: extracting web page addresses with regular expressions