# -*- coding: utf-8 -*-
"""
Created on Fri Apr 14 22:04:07 2017

@author: Administrator

Print the country name of every holiday listed on http://shijian.cc/jieri/.
"""
import urllib.request  # `import urllib` alone does not load the `request` submodule

from bs4 import BeautifulSoup

# NOTE(review): the original paste used typographic quotes (“ ” ″), which are
# syntax errors in Python; they are restored to ASCII quotes here.
res = urllib.request.urlopen("http://shijian.cc/jieri/")
soup = BeautifulSoup(res, "lxml")
# The page keeps its holiday list inside this bootstrap column div.
book_div = soup.find(attrs={"class": "col-md-6 col-sm-12"})
book_a = book_div.findAll(attrs={"class": "primary-link"})
for book in book_a:
    print(book.string)
输出了这个网页提供的节日的国家…好像没什么用,不过好歹是成功了
import requests

# Browser request headers -- many sites reject requests that carry no
# User-Agent, so always include one.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                  "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}
all_url = 'https://astr.ac.cn'  # starting URL

# Fetch the page. Note: `.content` is raw bytes (use it for images, video,
# audio and other binary downloads); use `.text` to print page markup.
start_html = requests.get(all_url, headers=headers)
print(start_html.text)

# NOTE(review): the file is now closed deterministically via a context
# manager instead of a manual open/close pair, and the redundant str()
# around `.text` (already a str) is dropped.
with open(r't.txt', 'w') as f:
    f.write(start_html.text)
保存网站的源代码
from bs4 import BeautifulSoup
import os
import urllib.request  # `import urllib` alone does not provide urllib.request


def taonvlang(url):
    """Download every image referenced via <img data-original=...> on *url*.

    Images are saved under D:/taonvlang/ as zero-padded sequence numbers
    (00001.jpg, 00002.png, ...) keeping each image's original extension.

    :param url: page to scrape for lazily-loaded images.
    """
    res = urllib.request.urlopen(url).read()
    data = res.decode()
    soup = BeautifulSoup(data, "lxml")
    path = "D:/taonvlang/"
    if not os.path.isdir(path):  # create the target directory if it is missing
        os.makedirs(path)
    count = 1  # sequence number used to name the saved images
    # NOTE(review): the loop variables were named `list` and `dict` in the
    # original, shadowing the builtins; renamed here.
    for img_tag in soup.find_all("img"):  # every <img> element on the page
        print(img_tag)
        attrs = img_tag.attrs  # the tag's attributes as a plain dict
        print(attrs)
        if "data-original" in attrs:
            image = attrs["data-original"]  # real image URL (lazy-load attribute)
            ext = image[image.rfind("."):]  # file extension, including the dot
            image_name = str(count).zfill(5) + ext
            filepath = os.path.join(path, image_name)
            with open(filepath, "wb") as f:
                image_data = urllib.request.urlopen(image).read()
                f.write(image_data)
            count += 1


if __name__ == "__main__":
    url = "http://www.mzitu.com"
    taonvlang(url)
获取图片
Smilie Vote is loading.
import requests
from bs4 import BeautifulSoup

# List every <article> on the blog index: title, publication time and link.
# NOTE(review): the original paste used typographic quotes (“ ” ″), which
# are syntax errors in Python; they are restored to ASCII quotes here.
res = requests.get("https://astr.ac.cn")
res.encoding = "utf-8"
# print(res.text)
# Explicit parser argument avoids bs4's GuessedAtParserWarning and matches
# the "lxml" parser used elsewhere in this file.
soup = BeautifulSoup(res.text, "lxml")
# print(soup.select("article")[0])
for soup2 in soup.select("article"):
    title = soup2.select("h2")[0].text   # article headline
    time = soup2.select("time")[0].text  # publication timestamp
    a = soup2.select("a")[0]["href"]     # link to the full article
    print(title, time, a)