04-代码封装整理
将网络相关的代码抽取到一个单独的文件中
import requests
def do_json_net(url, headers=None, timeout=10):
    """
    Fetch ``url`` with an HTTP GET request and return the decoded JSON body.

    :param url: address to request.
    :param headers: optional request headers; when omitted or empty, a
        desktop Chrome ``User-Agent`` header is sent instead.
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error; pass ``None`` to wait without limit.
    :return: the object produced by ``Response.json()``.
    """
    if not headers:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/87.0.4280.88 Safari/537.36"}
    # Without a timeout, requests blocks indefinitely on an unresponsive server.
    resp = requests.get(url=url, headers=headers, timeout=timeout).json()
    return resp
创建用户类
为方便后面的登录操作,创建一个用户类
class User:
    """
    A user record holding the account fields used by the application.

    :param name: user name.
    :param pwd: password.
    :param age: age.
    :param gender: gender.
    :param id: identifier of the user; defaults to 0.
    """

    def __init__(self, name, pwd, age, gender, id=0):
        self.name = name
        self.pwd = pwd
        self.age = age
        self.gender = gender
        self.id = id

    def __repr__(self):
        """Return a short description; ``pwd`` and ``id`` are not included."""
        return "<User>[name={},age={},gender={}]".format(self.name, self.age, self.gender)
数据库处理抽取
将方法抽取到类中
class DBTool:
    """
    Database helper that owns one MySQL connection and its cursor.

    Creating an instance connects to the server and makes sure the
    ``hotwords``, ``commentInfo``, ``video`` and ``users`` tables exist.
    Call :meth:`close` when finished to release the connection.
    """

    # DDL statements executed by init_db(). The continuation lines are kept
    # flush-left so the SQL text sent to the server is unchanged.
    hot_word_table_sql = """CREATE TABLE IF NOT EXISTS hotwords(_id INTEGER PRIMARY KEY AUTO_INCREMENT,
title VARCHAR (200) NOT NULL,hotNum VARCHAR(200));"""
    comment_info_sql = """CREATE TABLE IF NOT EXISTS commentInfo(_id INTEGER PRIMARY KEY AUTO_INCREMENT,
`key` VARCHAR(30) NOT NULL,`source` VARCHAR(200) NOT NULL);"""
    video_sql = """CREATE TABLE IF NOT EXISTS video(_id INTEGER PRIMARY KEY AUTO_INCREMENT,id VARCHAR(30) NOT NULL,
title VARCHAR(300) NOT NULL,poster VARCHAR(300) NOT NULL,source_name VARCHAR(300) NOT NULL,play_url VARCHAR(300) NOT NULL,
playcnt VARCHAR(40) NOT NULL,mthid VARCHAR(30) NOT NULL,mthpic VARCHAR(300) NOT NULL,threadId VARCHAR(30) NOT NULL,
duration VARCHAR(30) NOT NULL,comment_id VARCHAR(30) NOT NULL,publish_time VARCHAR(30) NOT NULL,new_cate_v2 VARCHAR(30)
NOT NULL,`like` VARCHAR (40) NOT NULL,fmlike VARCHAR(30) NOT NULL,comment VARCHAR(30) NOT NULL,fmcomment VARCHAR(30)
NOT NULL,fmplaycnt VARCHAR(30) NOT NULL,fmplaycnt_2 VARCHAR(30) NOT NULL,outstand_tag VARCHAR(30) NOT NULL);"""
    user_sql = """CREATE TABLE IF NOT EXISTS users(_id INTEGER PRIMARY KEY AUTO_INCREMENT,`name` VARCHAR (20) NOT NULL ,
`pwd` VARCHAR (20) NOT NULL ,`age` INT (3) NOT NULL ,`gender` VARCHAR (20));"""

    def __init__(self, host='192.168.3.35', port=3306, user='root', password='123456', database="video",
                 charset="utf8"):
        """
        Connect to the MySQL server and create the tables if needed.

        All arguments are forwarded to ``pymysql.connect``; the defaults
        reproduce the connection settings this class originally hard-coded.

        :param host: server address.
        :param port: server port.
        :param user: account name.
        :param password: account password.
        :param database: schema to use.
        :param charset: connection character set.
        """
        # NOTE(review): the default credentials live in source code; pass
        # real settings from configuration outside a local test setup.
        self.db = pymysql.connect(host=host, port=port, user=user, password=password, database=database,
                                  charset=charset)
        self.cursor = self.db.cursor()
        self.init_db()

    def init_db(self):
        """
        Initialise the database by executing every CREATE TABLE statement.

        :return: None
        """
        self.cursor.execute(DBTool.hot_word_table_sql)
        self.cursor.execute(DBTool.comment_info_sql)
        self.cursor.execute(DBTool.video_sql)
        self.cursor.execute(DBTool.user_sql)

    def save_video(self, videos):
        """
        Save video data.

        Placeholder: the body is omitted in this listing, so the method
        currently does nothing.

        :param videos: video data to store.
        :return: None
        """
        pass

    def save_hot_word(self, hotwords):
        """
        Store hot words.

        Placeholder: the body is omitted in this listing, so the method
        currently does nothing.

        :param hotwords: hot words to store.
        :return: None
        """
        pass

    def save_user(self, user):
        """
        Store a user.

        Placeholder: the body is omitted in this listing, so the method
        currently does nothing and returns None.

        :param user: the user to store.
        :return: intended contract: False means the user could not be stored.
        """
        pass

    def get_user(self, name, pwd):
        """
        Look up a user.

        Placeholder: the body is omitted in this listing, so the method
        currently always returns None.

        :param name: user name to search for.
        :param pwd: password to match.
        :return: intended contract: None means no user was found.
        """
        pass

    def close(self):
        """
        Close the cursor and the database connection.

        :return: None
        """
        try:
            self.cursor.close()
        finally:
            # Release the connection even if closing the cursor fails.
            self.db.close()
数据抓取类修改
这部分修改不多,就是一些方法和导入的修改,不再贴出来了。
关于数据获取部分,分析了网页的所有 tab 之后,按如下方式进行抓取:
if __name__ == '__main__':
    # Crawl the data and store it in the database.
    video_categories = ["recommend", "yingshi", "yinyue", "vlog", "youxi", "gaoxiao", "zongyi", "yule", "dongman",
                        "shenghuo", "guangchangwu", "meishi", "chongwu", "sannong", "junshi", "shehui", "tiyu", "keji",
                        "shishang", "qiche", "qinzi", "jiaoyu", "wenhua", "lvyou", "miaodong"]
    db = DBTool()
    try:
        hotwords = get_hot_words()
        db.save_hot_word(hotwords)
        # Make 20 passes over every category tab.
        for _ in range(20):
            for tab in video_categories:
                data = get_video_data(tab)
                db.save_video(data)
                # NOTE(review): assumes the 2-second pause follows each tab
                # request; the original nesting was lost - confirm.
                time.sleep(2)
    finally:
        # Close the database even when a crawl or save step raises.
        db.close()
    print("finish")
代码
参考github: https://github.com/huyuqiwolf/Video