|
| 1 | +# coding = utf-8 |
| 2 | +""" |
| 3 | +@author: zhou |
| 4 | +@time:2019/10/11 15:40 |
| 5 | +@File: analyse.py |
| 6 | +""" |
| 7 | + |
| 8 | +from pymongo import MongoClient |
| 9 | +import pandas as pd |
| 10 | +from pyecharts.charts import Bar |
| 11 | +from pyecharts import options as opts |
| 12 | +from wordcloud import WordCloud |
| 13 | +import jieba |
| 14 | +from PIL import Image |
| 15 | +import numpy as np |
| 16 | + |
| 17 | + |
# Connect to MongoDB and pull both collections of the "boss" database into
# DataFrames, keeping a CSV snapshot of each next to the script.
# NOTE(review): the URI template contains "%[email protected]", which looks like
# e-mail-obfuscation residue that replaced the second "%s" and the host name.
# "%[" is not a valid %-format directive, so this line presumably raises
# ValueError as written - restore the literal from the original file.
job_conn = MongoClient(
    "mongodb://%s:%[email protected]:51612/boss" % ('boss', 'boss123')
)
job_db = job_conn['boss']
job_collection = job_db['boss']
details_collection = job_db['job_details']

# Job postings: one row per document of the "boss" collection.
job = pd.DataFrame(list(job_collection.find()))
job.to_csv("job.csv", encoding='utf-8')

# Job descriptions: one row per document of the "job_details" collection.
job_detail = pd.DataFrame(list(details_collection.find()))
job_detail.to_csv('job_detail.csv', encoding='utf-8')
| 28 | + |
# Salary levels: bar chart of the first ten entries of the salary value counts.
salary_distribute = job['salary'].value_counts()
bar = (
    Bar()
    .add_xaxis(salary_distribute.index.values.tolist()[:10])
    .add_yaxis("", salary_distribute.values.tolist()[:10])
    .set_global_opts(
        # Tilt the axis labels so the salary ranges do not overlap.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-30)),
        title_opts=opts.TitleOpts(title="薪资分布", subtitle="薪资分布前10"),
    )
)
bar.render_notebook()
| 39 | + |
# High-paying companies: postings whose salary field is exactly '30-60K'.
height_salary = job.loc[job['salary'] == '30-60K']
height_salary_year_edu = height_salary.loc[:, ['name', 'salary', 'year', 'edu']]

# Job requirements: collect the description text of every high-paying posting.
# NOTE(review): rows of `job` and `job_detail` are paired by DataFrame index
# label here; presumably both collections were stored in the same order -
# confirm against the scraper.
height_salary_index = height_salary.index.values.tolist()
details = []
for row_label in height_salary_index:
    matched = job_detail.loc[job_detail.index == row_label, 'details']
    # Take the first (expected to be the only) description for this label.
    details.append(matched.values.tolist()[0])
print(details)
| 51 | + |
| 52 | + |
# Word cloud
# Tokens removed from the segmented text before the cloud is drawn.
stopworld = ('职位', '描述', '岗位职责', '岗位', '任职', '要求', '分项')
# Font file handed to WordCloud via font_path.
font = r'C:\Windows\Fonts\FZSTK.TTF'


def gen_wordcloud(data, pic, world_pic):
    """Render a word cloud image from a list of job-description strings.

    The descriptions are concatenated, segmented with jieba, filtered
    against ``stopworld`` and drawn with ``pic`` as the mask image.

    Args:
        data: Sequence of description strings. Entries that are not
            strings (for example NaN from an empty DataFrame cell) are
            skipped.
        pic: Path of the image whose pixels are used as the cloud mask.
        world_pic: Path the rendered word cloud image is written to.
    """
    # Use every entry: the earlier index loop ran to len(data) - 1 and
    # silently dropped the last description.
    text = ''.join(item for item in data if isinstance(item, str))
    kept_tokens = [tok for tok in jieba.lcut(text) if tok not in stopworld]
    # WordCloud splits its input with a regular expression, so the tokens
    # must stay separated; gluing them back together without a delimiter
    # would throw away jieba's segmentation.
    cut_word = ' '.join(kept_tokens)
    # Close the mask file as soon as its pixels have been copied.
    with Image.open(pic) as img:
        img_array = np.array(img)
    wc = WordCloud(
        width=1800,
        height=1500,
        background_color='white',
        font_path=font,
        mask=img_array,
    )
    wc.generate(cut_word)
    wc.to_file(world_pic)
| 70 | + |
| 71 | + |
# Word cloud of the high-paying job descriptions, masked by money.jpg.
gen_wordcloud(data=details, pic='money.jpg', world_pic='money_wc.png')
| 73 | + |
# Years of experience: bar chart of the first ten entries of the value counts.
year_distribute = job['year'].value_counts()
bar = (
    Bar()
    .add_xaxis(year_distribute.index.values.tolist()[:10])
    .add_yaxis("", year_distribute.values.tolist()[:10])
    .set_global_opts(
        # Tilt the axis labels so neighbouring categories do not overlap.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-30)),
        title_opts=opts.TitleOpts(title="工龄分布"),
    )
)
bar.render_notebook()
| 85 | + |
# Education requirements: bar chart of the first ten entries of the value counts.
edu_distribute = job['edu'].value_counts()
bar = (
    Bar()
    .add_xaxis(edu_distribute.index.values.tolist()[:10])
    .add_yaxis("", edu_distribute.values.tolist()[:10])
    .set_global_opts(
        # Tilt the axis labels so neighbouring categories do not overlap.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-30)),
        title_opts=opts.TitleOpts(title="学历分布"),
    )
)
bar.render_notebook()
| 97 | + |
| 98 | + |
# Companies recruiting master's degree holders: keep only the postings whose
# education field is '硕士', reduced to the four columns of interest.
height_edu = job.loc[job['edu'] == '硕士']
height_edu = height_edu.loc[:, ['name', 'salary', 'year', 'edu']]
| 102 | + |
# Word cloud over every job description, masked by job.jpg.
all_job_detail = job_detail['details'].values.tolist()
gen_wordcloud(data=all_job_detail, pic='job.jpg', world_pic='fulljob_wc.png')
0 commit comments