| by 鲍 建伟 | 3 comments

爬取垃圾分类信息代码

不多说,先上代码:

import requests
from bs4 import BeautifulSoup
import sqlite3

def get_your_input():
    """Prompt the user for the name of a waste item to look up.

    Leading and trailing whitespace is dropped and internal runs of
    whitespace are collapsed to a single space.

    Returns:
        The cleaned-up text the user typed, or the integer ``0`` when the
        input was empty or whitespace-only (kept so existing callers that
        test the result for falsiness keep working).
    """
    tokens = input("请输入需要查询的垃圾:").split()
    if not tokens:
        return 0
    # Join the tokens directly rather than slicing ``str(list)``: the repr
    # trick leaked "', '" between words and doubled any backslashes.
    return " ".join(tokens)

def url_joint(your_target):
    """Build the lookup URL for one waste item.

    Args:
        your_target: The item to query; it is converted with ``str()``
            before being appended to the site's search path.

    Returns:
        The query URL as a string.
    """
    base = 'http://lajifenleiapp.com/sk/'
    return '{}{}'.format(base, str(your_target))

def get_source_code(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on an
            unresponsive host, stalling the whole crawl.

    Returns:
        The response body decoded to ``str`` by ``requests``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

def data_parser(req):
    """Extract the lookup result from a result page's HTML.

    The result is read from the third child node of the first ``<h1>``
    whose ``style`` attribute is exactly ``text-align: left;``
    (presumably the category label -- the caller stores it as the
    item's category).

    Args:
        req: HTML source of the result page, as a string.

    Returns:
        The text of that node, or '未查询到数据' when the page has no such
        heading or the heading has fewer than three child nodes.
    """
    not_found = '未查询到数据'
    soup = BeautifulSoup(req, 'html.parser')
    # A single ``find`` replaces two identical ``find_all`` passes.
    heading = soup.find('h1', {'style': 'text-align: left;'})
    if heading is None:
        return not_found
    children = heading.contents
    # Guard against a changed page layout instead of raising IndexError.
    if len(children) < 3:
        return not_found
    return children[2].text

def db_create(db_path='rubbish.db'):
    """Open the SQLite database and return the connection.

    Args:
        db_path: Database file to open; defaults to ``rubbish.db`` in the
            current working directory.

    Returns:
        An open ``sqlite3.Connection``.

    Raises:
        sqlite3.Error: If the database cannot be opened. The failure
            message is printed before the exception is re-raised.
    """
    # Connect exactly once: the previous truthiness probe opened a second
    # connection that was never closed, and ``connect`` signals failure by
    # raising, never by returning a falsy value.
    try:
        conn = sqlite3.connect(db_path)
    except sqlite3.Error:
        print('数据库打开失败!')
        raise
    print('数据库连接成功!')
    return conn

def tb_create(conn):
    """Create the ``RUBBISH_sheet`` table if it does not exist yet.

    Args:
        conn: Open ``sqlite3.Connection`` to create the table in.

    Raises:
        sqlite3.OperationalError: For any failure other than the table
            already existing (e.g. a locked or read-only database).
    """
    cur = conn.cursor()
    try:
        cur.execute('''CREATE TABLE RUBBISH_sheet (Name varchar(255), Category varchar(255));''')
    except sqlite3.OperationalError as e:
        # Only "already exists" is benign; other operational errors used
        # to be misreported as an existing table and silently swallowed.
        if 'already exists' not in str(e):
            raise
        print('table RUBBISH_sheet已经存在。')
        return
    conn.commit()
    print("创建垃圾表成功!")

def tb_insert(conn, rubbish_tuple):
    """Insert one (name, category) row into ``RUBBISH_sheet`` and commit.

    Args:
        conn: Open ``sqlite3.Connection`` that already contains the
            ``RUBBISH_sheet`` table.
        rubbish_tuple: Sequence whose first item is the waste name and
            whose second item is its category.
    """
    name, category = rubbish_tuple[0], rubbish_tuple[1]
    cur = conn.cursor()
    # Bind values with placeholders: the data comes from a text file and a
    # scraped web page, so a stray quote would otherwise break the
    # statement or inject SQL.
    cur.execute(
        "INSERT INTO RUBBISH_sheet (Name,Category) VALUES (?, ?)",
        (name, category),
    )
    conn.commit()
    print("插入数据成功!")

def read_file_to_list(path="rubbishname.txt"):
    """Read the list of waste names, one per line, from a UTF-8 text file.

    Args:
        path: File to read; defaults to ``rubbishname.txt`` in the current
            working directory.

    Returns:
        A list with one string per line of the file. Each entry keeps its
        trailing newline, so callers are expected to strip it.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(path, encoding='UTF-8') as f:
        return f.readlines()

if __name__ == '__main__':
    # Look up every name listed in rubbishname.txt and store each
    # (name, result) pair in the RUBBISH_sheet table of rubbish.db.
    rubbish_names = read_file_to_list()
    conn = db_create()
    try:
        # The table only needs to be created once, not once per item.
        tb_create(conn)
        for raw_name in rubbish_names:
            name = raw_name.strip()
            # Skip blank lines so no request is sent for an empty name.
            if not name:
                continue
            url = url_joint(name)
            req = get_source_code(url)
            data = data_parser(req)
            print(data)
            tb_insert(conn, (name, data))
    finally:
        # Release the database handle even if a lookup fails midway.
        conn.close()

将 rubbishname.txt 文件放到代码所在目录下。该文件需以 UTF-8 编码保存,每行记录一个待查询的垃圾名称,示例如下图:

3 Comments

大屌辉

7月 7, 2019, 2:33 下午 回复

牛比 我射了,老鲍你呢

Robertmal

8月 8, 2019, 4:36 上午 回复

Hello! imbobby.net

We make offer for you

Sending your commercial offer through the feedback form which can be found on the sites in the Communication partition. Feedback forms are filled in by our program and the captcha is solved. The profit of this method is that messages sent through feedback forms are whitelisted. This method improve the probability that your message will be read.

Our database contains more than 25 million sites around the world to which we can send your message.

The cost of one million messages 49 USD

FREE TEST mailing of 50,000 messages to any country of your choice.

This message is automatically generated to use our contacts for communication.

Contact us.
Telegram – @FeedbackFormEU
Skype FeedbackForm2019
Email – FeedbackForm@make-success.com
WhatsApp – +44 7598 509161

Anthony Russell

9月 9, 2019, 12:26 下午 回复

Good day,

I am reaching out to you based on a request from a profiled client who is looking for a potential investment opportunity within your scope of business .

Details of investment proposal will be sent out to you on reading back from you as we deem it necessary to seek for your consent prior to any formal exchange of material information relating to the Subject matter .

I look forward to your earliest response , please do contact me directly only via my private email address stated below .

Kind Regards,

Anthony Russell
Managing Partner
Tel Line: +447440934362
Email : anthonyrussell@deximinvestmentsolutionsukltd.com

发表评论