'matplotlib' 태그의 글 목록

matplotlib

2-4 파이썬 팀프로젝트 : matplotlib, smtplib 메일 자동화

C.L.O.W.N 2021. 7. 19. 19:00

2021. 7. 19. 19:00

336x280(권장), 300x250(권장), 250x250, 200x200 크기의 광고 코드만 넣을 수 있습니다.

두번째 프로젝트 순서

1. 프로젝트 주제 정하기

2. 기획 및 데이터 수집, 전처리

3. 데이터 저장(판다스 열/행 관련 정리)

4. 시각화 및 자동화

프로젝트 마무리

매 10분마다 크롤링을 진행하고, 19시가 되면 이슈와 사설을 이메일로 보내준다. 자동화는 코드가 어디있는지 주섬주섬 다시 찾아봐야 한다 ㅠㅠ. 진행을 이슈파트 따로 사설파트 따로 해서, 사설은 어떤 식으로 코드 진행이 됐는지 잘 모르겠지만 나쁘지 않은 결과가 나왔다.

1
2
3
4
5
6
7
8
9
10
11
12
13

import os
from datetime import datetime

from wordcloud import WordCloud

# Folder that receives the generated images; created on first run.
base_dir = 'C:/Workspace/project2_final/output/'
os.makedirs(base_dir, exist_ok=True)

# 800x800 white-background cloud rendered with the Malgun Gothic Bold font.
wc = WordCloud(
    font_path=r'C:\Windows\Fonts\MalgunBD.ttf',
    background_color="white",
    max_words=150,
    max_font_size=300,
    width=800,
    height=800,
)
# `sum_search` is defined outside this snippet; it only has to be accepted by dict().
# presumably it maps keyword -> weight -- TODO confirm against the crawler code.
cloud = wc.generate_from_frequencies(dict(sum_search))

# One image per day: "IssueKeyWord YYYYMMDD.png" inside base_dir.
date_stamp = datetime.today().strftime('%Y%m%d')
cloud.to_file(f'{base_dir}IssueKeyWord {date_stamp}.png')
 
Colored by Color Scripter

cs

하루 일정이 마무리 되면 실시간 TOP100개의 키워드와 언급된 게시물 조회수의 비중에 따라 워드클라우드를 생성한다.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

#### Top 5 keywords ####

# The sheet is read with two header rows (a column MultiIndex) and the
# first column as the row index.
df = pd.read_excel(path2, header=[0, 1], index_col=[0])
df_col = list(df.columns.levels[0])

# Highest '조회수' value observed for each first-level column label.
hit_top5 = [df[keyword, '조회수'].max() for keyword in df_col]

# label -> its peak '조회수' value
hit_dict = dict(zip(df_col, hit_top5))

# Rank labels by that peak value, largest first, and keep the first five.
ranked_keywords = sorted(hit_dict, key=hit_dict.get, reverse=True)
keyword_top5 = ranked_keywords[:5]
print(keyword_top5)
Colored by Color Scripter

cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

#### Charts for the 19:00 report

import matplotlib.pyplot as plt
import seaborn as sns

title_font = {'fontsize': 16, 'fontweight': 'bold'}
plt.rc('font', family='Malgun Gothic')  # font family used for the Korean labels
plt.figure(figsize=[20, 10])
plt.style.use('ggplot')
# NOTE(review): the figure created above is shown for a second and then closed,
# so its figsize does not carry over to the charts saved below -- confirm intent.
plt.show(block=False)
plt.pause(1)
plt.close()

# `today` is defined outside this snippet.
ymd = today.strftime('%Y-%m-%d')
# NOTE(review): hms is hard-coded, so the branch below always runs; presumably
# a placeholder for a real clock check -- confirm against the scheduler code.
hms = '19:00:00'

if hms == '19:00:00':
    # One sub-folder per day under output/graphs; makedirs creates any missing
    # parent and does not fail when the folder already exists.
    path_png = "C:/Workspace/project2_final/output/graphs"
    path_date = f"{path_png}/{ymd}"
    os.makedirs(path_date, exist_ok=True)

    ##### Bar chart of the '조회수' column for each top keyword #####
    for i in keyword_top5:
        plt.title(f"키워드 : '{i}' 조회수 변동", fontsize=20)
        df[i, '조회수'].plot(kind='bar')
        plt.savefig(f'{path_date}/조회수변동-({i}).png', bbox_inches='tight')
        plt.close()  # start the next chart from a clean figure

    ##### Pie chart of mean '조회수' per '서브키워드' for each top keyword #####
    for i in keyword_top5:
        text = f'{i} 키워드 관심도'
        sub_keys = df[i].groupby(['서브키워드'])['조회수'].mean().sort_values()
        plt.title(text, fontdict=title_font, loc='center', pad=20)
        sub_keys.plot(kind='pie', autopct='%1.1f%%', shadow=True, startangle=110)
        plt.savefig(f'{path_date}/관심도-({i}).png')
        plt.close()

    print('그래프 시각화 완료')
Colored by Color Scripter

cs

19시가 되면 조회수 변동, 키워드에 따른 서브 키워드가 얼마나 변했는지 확인할 수 있다. 이거는 강의실에 와서 발표전 30분 데이터를 취합해서 만든 것이라 서브 키워드 변동도 거의 없고, 조회수 변동폭도 크지 않음을 확인할 수 있다.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

import smtplib
from email.mime.multipart import MIMEMultipart;
# 메일의 본문 내용을 만드는 모듈
from email.mime.text import MIMEText;
# 메일의 첨부 파일을 base64 형식으로 변환
from email.mime.application import MIMEApplication;
# 메일의 이미지 파일을 base64 형식으로 변환(Content-ID 생성)
from email.mime.image import MIMEImage;
# 메일의 음악 파일을 base64 형식으로 변환(Content-ID 생성)
from email.mime.audio import MIMEAudio;
# 파일 IO
import io;
 
# Build the message before talking to the mail server.
data = MIMEMultipart()
# Sender
data['From'] = "본인의 이메일"
# Recipients: a header value must be a single string, so several addresses are
# joined with commas (assigning a tuple breaks data['To'].split(",") later on).
recipients = ["이메일1", "이메일2"]
data['To'] = ",".join(recipients)
# Subject
data['Subject'] = "제목"

mail_dir = "C:\\workspace\\project2_final\\mailsource\\"
output_dir = "C:\\workspace\\project2_final\\output\\"
# Date stamp used in the names of the word-cloud images.
stamp = datetime.today().strftime('%Y%m%d')

# (file path, content id) for every inline image. The HTML body refers to an
# image as "cid:<content id>", and the attachment name is "<content id>.png".
inline_images = [
    (mail_dir + "hi.png", "hi"),
    (mail_dir + "issue.png", "issue"),
    (mail_dir + "news.png", "news"),
    (mail_dir + "tw.png", "tw"),
    (mail_dir + "fb.png", "fb"),
    (output_dir + "IssueKeyWord {}.png".format(stamp), "wc1"),
    (output_dir + "KeyWord {}.png".format(stamp), "wc2"),
]

for image_path, cid in inline_images:
    with open(image_path, 'rb') as fp:
        img = MIMEImage(fp.read(), Name=cid + ".png")
    img.add_header('Content-ID', '<' + cid + '>')
    data.attach(img)
    
 
Colored by Color Scripter

cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

# HTML body of the mail. Images are referenced through "cid:" links that match
# the Content-ID headers of the attached images. Placeholders {0}-{9} are the
# five community keywords with their article text, {10}-{15} the three
# editorial keywords with their content, and {16} the frequent-keyword list.
# The template must not contain literal curly braces because str.format is
# applied to the whole string.

html = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
  <title>하루 이슈</title>
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
</head>
<body style="margin: 0; padding: 0;">
 <table align="center" border="0" cellpadding="0" cellspacing="0" width="600" style="border: 1px solid #cccccc;">
 <tr>
  <td align="center" bgcolor="#bdd7ee" style="padding: 40px 0 30px 0;">
 <img src="cid:hi" alt="Creating Email Magic" width="600" height="380" style="display: block;" />
</td>
 </tr>
 <tr>
  <td bgcolor="#ffffff" style="padding: 40px 30px 40px 30px;">
<table border="0" cellpadding="0" cellspacing="0" width="100%">
 <tr>
  <td style="font-family: 'Noto Sans KR', sans-serif;">
   <table border="0" cellpadding="0" cellspacing="0" width="100%">
 <tr>
  <td width="260" valign="top" style="font-family:Noto Sans KR, sans-serif; font-size: 16px; line-height: 20px;">
   <h2>커뮤니티 인기 키워드 100</h2>
   <img src="cid:wc1" alt="Creating Email Magic" width="250" height="250" style="display: block;" />
  </td>
  <td style="font-size: 0; line-height: 0;" width="20">
   &nbsp;
  </td>
  <td width="260" valign="top" style="font-family:Noto Sans KR, sans-serif; font-size: 16px; line-height: 20px;">
   <h2>사설 주요 키워드 100</h2>
   <img src="cid:wc2" alt="Creating Email Magic" width="250" height="250" style="display: block;" />
  </td>
 </tr>
</table>
  </td>
 </tr>
 <tr>
  <td style="padding: 20px 0 30px 0; font-family:Noto Sans KR, sans-serif; font-size: 16px; line-height: 20px;">
   <p>오늘 하루도 수고하셨습니다. 하이와 함께 행복한 하루를 마무리하세요!</p>
   <br>
   <br>
  </td>
 </tr>
 <tr>
  <td>
  <table border="0" cellpadding="0" cellspacing="0" width="100%">
 <tr>
  <td width="260" valign="top">
   <table border="0" cellpadding="0" cellspacing="0" width="100%">
    <tr>
     <td bgcolor="#3B89A3" align="center" style="padding: 20px 30px 20px 30px;">
      <img src="cid:issue" alt="" width="50%" height="100" style="display: block;" />
     </td>
    </tr>
    <tr>
     <td style="padding: 10px 0 30px 0; font-family:Noto Sans KR, sans-serif; font-size: 16px; line-height: 20px;"> 
     <h3 style="font-family:Noto Sans KR, sans-serif;">오늘의 커뮤니티 인기 키워드</h3>
      <details><p style="line-height:180%">{1}</p>
       <summary>{0}
       </summary>
      </details>
      <br>
      <details><p style="line-height:180%">{3}</p>
       <summary>{2}
       </summary>
      </details>
      <br>
      <details><p style="line-height:180%">{5}</p>
       <summary>{4}
       </summary>
      </details>
      <br>
      <details><p style="line-height:180%">{7}</p>
       <summary>{6}
       </summary>
      </details>
      <br>
      <details><p style="line-height:180%">{9}</p>
       <summary>{8}
       </summary>
      </details>
     </td>
    </tr>
   </table>
  </td>
  <td style="font-size: 0; line-height: 0;" width="20">
   &nbsp;
  </td>
  <td width="260" valign="top">
   <table border="0" cellpadding="0" cellspacing="0" width="100%">
    <tr>
     <td bgcolor="#3B89A3" align="center" style="padding: 20px 30px 20px 30px;">
      <img src="cid:news" alt="" width="50%" height="100" style="display: block;" />
     </td>
    </tr>
    <tr>
     <td style="padding: 10px 0 30px 0; font-family:Noto Sans KR, sans-serif; font-size: 16px; line-height: 20px;"> 
     <h3 style="font-family:Noto Sans KR, sans-serif;">오늘의 사설 주요 키워드</h3>
      <details><p style="line-height:180%">{11}</p>
       <summary>{10}
       </summary>
      </details>
      <br>
      <details><p style="line-height:180%">{13}</p>
       <summary>{12}
       </summary>
      </details>
      <br>
      <details><p style="line-height:180%">{15}</p>
       <summary>{14}
       </summary>
      </details>
      <br>
      <details><p style="line-height:180%">{16}</p>
       <summary>오늘 자주 등장한 키워드
       </summary>
      </details>
      
     </td>
    </tr>
   </table>
  </td>
 </tr>
</table>
  </td>
 </tr>
</table>
</td>
 </tr>
 <tr>
  <td bgcolor="#bdd7ee" style="padding: 30px 30px 30px 30px;">
 <table border="0" cellpadding="0" cellspacing="0" width="100%">
 <tr>
 <td width="75%" style="color: #000000; font-family: Arial, sans-serif; font-size: 14px;">
 &reg; 하루이슈, Hi 2021<br/>
 <a href="" style="color: #000000;"><font color="#000000">Unsubscribe</font></a> to this newsletter instantly
</td>
  <td align="right">
 <table border="0" cellpadding="0" cellspacing="0">
  <tr>
   <td>
    <a href="http://www.twitter.com/">
     <img src="cid:tw" alt="Twitter" width="38" height="38" style="display: block;" border="0" />
    </a>
   </td>
   <td style="font-size: 0; line-height: 0;" width="20">&nbsp;</td>
   <td>
    <a href="http://www.facebook.com/">
     <img src="cid:fb" alt="Facebook" width="38" height="38" style="display: block;" border="0" />
    </a>
   </td>
  </tr>
 </table>
</td>
 </tr>
</table>
</td>
 </tr>
</table>
</body>
 
</html>
""".format(keyword_top5[0], articlestr1, keyword_top5[1], articlestr2, keyword_top5[2], articlestr3, keyword_top5[3], articlestr4, keyword_top5[4], articlestr5, sendingKeywords[0], content1, sendingKeywords[1], content2, sendingKeywords[2], content3, dfCSV["text"].tolist())


# Wrap the rendered HTML as a text/html MIME part.
msg = MIMEText(html, 'html')
 
 
# Add the HTML part to the multipart message and dump it for inspection.
data.attach(msg)
print(data)

# Envelope sender (MAIL FROM) and recipients (RCPT TO) come from the headers.
# The To header is comma-separated; surrounding whitespace and empty entries
# are dropped so "a@x.com, b@y.com" yields two clean addresses.
sender = data['From']
receiver = [addr.strip() for addr in data['To'].split(",") if addr.strip()]
# To include Cc/Bcc recipients, extend `receiver` the same way from
# data['Cc'] / data['Bcc'] when those headers are present.

# Implicit-TLS connection on port 465. The context manager sends QUIT and
# closes the socket even when login or sendmail raises. For a STARTTLS server
# (e.g. smtp.gmail.com on port 587) use smtplib.SMTP and call starttls()
# before logging in.
with smtplib.SMTP_SSL('smtp.naver.com', 465) as server:
    # Echo the SMTP conversation for debugging.
    server.set_debuglevel(1)
    # login() performs EHLO itself when needed, so no explicit ehlo() calls.
    server.login("ID", "P/W!")  # account id, password
    # sendmail issues MAIL, RCPT and DATA in the order the protocol requires.
    server.sendmail(sender, receiver, data.as_string())
 
Colored by Color Scripter

cs

'도전하자. 프로젝트' 카테고리의 다른 글

3-2 파이썬 팀프로젝트 CNN 모델링 - 인공지능, 머신러닝, 딥러닝 뭔데? (0)	2021.08.01
3-1 파이썬 팀프로젝트 CNN 식물 병충해 분류 (0)	2021.07.31
2-3 파이썬 팀프로젝트 : 데이터 저장, 판다스 열/행 관련 총정리 + 폴더/파일 생성 (0)	2021.07.18
2-2 파이썬 프로젝트 : BS4 웹크롤링, 형태소 분석, (0)	2021.07.17
2-1 파이썬 팀프로젝트 : 네이버 실시간검색 대체 이슈 예측 기획 (0)	2021.07.16

1-5. 파이썬 EDA 데이터 분석 팀 프로젝트 간단 정리

C.L.O.W.N 2021. 6. 26. 22:39

2021. 6. 26. 22:39

336x280(권장), 300x250(권장), 250x250, 200x200 크기의 광고 코드만 넣을 수 있습니다.

- 첫 프로젝트 글 순서 -
1. 파이썬(python) EDA 데이터분석 주제 정하기
2. 실패한 여기어때 후기 웹스크래핑(web scraping)
3. 데이터 수집 방법 & 데이터 추출, 정제
4. 판다스(pandas) 데이터 처리 / Matplotlib, Json 시각화
5. 정리

지난 번에 글을 올린지 딱 2달이 지났다. 이후에 프로젝트를 벌써 3개를 더했는데... 틈만 나면 해야지 해야지 하는데 시간이 많이 나지를 않았다. 머신러닝, 인공지능 등 개념학습 따라가기도 너무 벅찼고, 모델은 아직까지 이해도 안 되는 수준이긴 하지만 더 늦어지면 안 될 것 같아서 미리미리 간단하게 쓰기로 했다.

솔직히 EDA 데이터 분석 팀프로젝트 5번째 항목 정리를 왜 했지라는 생각이 든다. 그래서 간단하게 어떻게 진행을 했고 어떤 것을 사용했는지, 결론은 어떻게 도출했는지만 간단하게 쓰려고 한다. 부지런하게 남은 프로젝트들도 업로드하면서 github도 사용해봐야 하는데 쉽지 않다. 개인적으로 마케팅 분석하고 sns까지 하다보니 시간이 남지 않는다. 해커톤까지 ???

나중에 알게 된건데 필수로 써야하는 부분이 몇개 있다.

1. 언어(파이썬, 자바, C 등..), 패키지(matplotlib, pandas 등), 툴(주피터노트북, 코랩, 파이참 등..)
2. 일정표
3. 과정, 순서 등

이번에 주로 사용했던 것은 pandas, matplotlib, json이다. sns도 사용하려고 했는데 당시에는 생각보다 쉽지 않아서 matplotlib으로 데이터 시각화를 대부분 담당했다. 나중에도 시각화하는데 많이 사용하기 때문에 알아두는 것이 좋다.

과정, 순서 등은 어떤 부분에서 얼만큼의 시간을 썼는지 쓰면 좋을 것이다.

프로젝트를 할 때 주제가 많이 제한적이게 된 이유는 데이터를 구하기 너무 어렵다. 예전에 데이터가 곧 권력이고 힘이라고 했던 부분, 그리고 스프링쿨러? 온도, 습도 등의 데이터를 모아놨던 회사가 구글에 데이터를 고가에 팔았다는 것, 머신러닝을 하면서 더 느끼게 됐다.

데이터를 토대로 결론 도출을 해봤다
- 창업시 고려해야할 위치와 업종을 데이터를 통해 파악하기 쉽다.
- 업종에서는 소매업이 가장 많고, 그 다음이 음식점, 그리고 도민의 소매업이 평상시에도 많다.
- 성수기 때 슈퍼마켓, 체인화 편의점 이용률이 많이 늘어났다. 숙박업을 할 경우 미리 물품 구매를 하면 더 좋을 것이다.

=> 하고 싶은 업종을 입력했을 때, 위치, 경쟁, 타겟층을 보여주는 프로그램을 보여줘도 나쁘지 않을 것 같다.

'도전하자. 프로젝트' 카테고리의 다른 글

2-2 파이썬 프로젝트 : BS4 웹크롤링, 형태소 분석, (0)	2021.07.17
2-1 파이썬 팀프로젝트 : 네이버 실시간검색 대체 이슈 예측 기획 (0)	2021.07.16
1-4. 파이썬 EDA 데이터 분석 팀 프로젝트 판다스, 시각화 (Matplotlib, Json) (0)	2021.04.26
1-3. 파이썬 EDA 데이터분석 팀 프로젝트 데이터 수집, 정제 등 (0)	2021.04.26
1-2. 파이썬 EDA 데이터분석 팀 프로젝트 웹스크래핑 (Beautifulsoup&Requests) (0)	2021.04.16

1-4. 파이썬 EDA 데이터 분석 팀 프로젝트 판다스, 시각화 (Matplotlib, Json)

C.L.O.W.N 2021. 4. 26. 19:21

2021. 4. 26. 19:21

336x280(권장), 300x250(권장), 250x250, 200x200 크기의 광고 코드만 넣을 수 있습니다.

- 첫 프로젝트 글 순서 -

1. 파이썬(python) EDA 데이터분석 주제 정하기

2. 실패한 여기어때 후기 웹스크래핑(web scraping)

3. 데이터 수집 방법 & 데이터 추출, 정제

4. 판다스(pandas) 데이터 처리 / Matplotlib, Json 시각화

5. 정리

데이터를 어떤 식으로 그룹핑을 할지 고민을 하다가 cost(비용)으로 하기로 했다.

-큰 틀에서 묶기-

제주시/서귀포시 매출 비교

업종별 매출

성별/연령별 소비

개별/단체 소비

성수기/비수기

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

# Every aggregate below is the sum of 'cost' per group of the listed columns.
# Multi-column groupings take a list with one string per column; a single
# string such as 'visitor, sex' is looked up as one (non-existent) column.

# Cost per date / sex / age / stores, drawn as a bar chart
sum_stores = df.groupby(['date', 'sex', 'age', 'stores'])['cost'].sum()
sum_stores.plot(kind='bar', rot=90)

# Peak vs. off season: cost per date
sum_sb = df.groupby('date')['cost'].sum()

# Sales per stores value (rebinds sum_stores)
sum_stores = df.groupby('stores')['cost'].sum()

# Spending per sex and age
sum_sa = df.groupby(['sex', 'age'])['cost'].sum()

# stores usage by sex, by age, and by sex and age
sum_st = df.groupby(['sex', 'stores'])['cost'].sum()
sum_at = df.groupby(['age', 'stores'])['cost'].sum()
sum_sat = df.groupby(['sex', 'age', 'stores'])['cost'].sum()

# Spending per city (Jeju-si vs. Seogwipo-si) / per city2 (town-level district)
sum_city = df.groupby(['city'])['cost'].sum()
sum_city2 = df.groupby(['city2'])['cost'].sum()

# Spending per stores value within each city / within each city2
sum_ct = df.groupby(['city', 'stores'])['cost'].sum()
sum_ct2 = df.groupby(['city2', 'stores'])['cost'].sum()

# stores usage per city2 and sex
sum_cst2 = df.groupby(['city2', 'sex', 'stores'])['cost'].sum()

# Individual vs. group visitors: total cost
sum_visitor = df.groupby(['visitor'])['cost'].sum()

# Visitor kind broken down by sex / by stores / by user
sum_vs = df.groupby(['visitor', 'sex'])['cost'].sum()
sum_vt = df.groupby(['visitor', 'stores'])['cost'].sum()
sum_vu = df.groupby(['visitor', 'user'])['cost'].sum()

# stores used per visitor kind and user (domestic, corporate, foreign)
sum_vut = df.groupby(['visitor', 'user', 'stores'])['cost'].sum()

cs

우선 데이터가 어떻게 보여지는지 기본적인 시각화를 진행했다. 만약 세부정보가 필요하면, 세분화해서 EDA데이터 분석하는데 유용하게 활용했다.

1
2
3
4
5
6

# One stores value on one date: cost summed per type
is_retail = df.stores == '소매업'
is_feb17 = df.date == 'Feb-17'
sum_sdt = df[is_retail & is_feb17].groupby('type')['cost'].sum()

# Rows with age == 10: cost per (sex, type), sorted ascending, last 20 kept
age_10_rows = df[df.age == 10]
cost_by_sex_type = age_10_rows.groupby(['sex', 'type'])['cost'].sum()
sum_ast = cost_by_sex_type.sort_values().tail(20)
 
Colored by Color Scripter

cs

열에서 특정 연령, 성별, 업종만 뽑아내서 더하고 싶으면, df[df.열 == '열 속성'].groupby를 이용해주면 된다. 특정 열에서 속성을 추출해서 더하는 코드가 나오지 않아서 참고하면 좋을 것 같다.

1
2
3
4
5

# Font family used for the Korean labels
plt.rc('font', family='Malgun Gothic')
plt.figure(figsize=(20, 10))
plt.style.use('ggplot')

# Total cost per (sex, age) pair, drawn as a vertical bar chart
cost_by_sex_age = df.groupby(['sex', 'age'])['cost']
sum_sa = cost_by_sex_age.sum()
sum_sa.plot(kind='bar')

cs

matplotlib 함수를 활용하여 가장 기본적인 시각화를 했다. 
 
 
 

얘네를 모아서 EDA 데이터 분석에 필요한 자료만 모으고, title, x축, y축, 그래프 색, 그래프 종류(box, bar, barh, violinplot, pie) 등으로 다시 시각화 과정을 진행했다.

1
2
3
4
5
6
7
8
9
10

# Horizontal bar chart: cost per type for rows where stores is '소매업'
# and visitor is '도민'.
plt.figure(figsize=[10, 5])
title_font = {'fontsize': 16, 'fontweight': 'bold'}
plt.title('도민 소매업 매출 순위', fontdict=title_font, loc='center', pad=20)

# Named so that it does not shadow the builtin sum().
retail_by_type = df[(df.stores == '소매업') & (df.visitor == '도민')].groupby('type')['cost'].sum()

ax = plt.subplot()
# NOTE(review): these labels are fixed by hand and applied before plotting, so
# they only line up if the automatic ticks fall on matching values; the last
# entry ('800억') looks out of sequence -- confirm against the rendered chart.
ax.set_xticklabels(['0', '500억', '1000억', '1500억', '2000억', '2500억', '800억'])
retail_by_type.plot(kind='barh', color='#f39189')
plt.xlabel('Total Amount')
plt.ylabel('Type of Business')
Colored by Color Scripter

cs

barh 막대 그래프

1
2
3
4
5
6

# Pie chart of total cost per stores value.
# Offsets for the wedges: the first five are pulled out by 0.1, the last two
# stay in place (seven entries in total).
explode = [0.1] * 5 + [0] * 2
title_font = dict(fontsize=16, fontweight='bold')
plt.title('업종별 매출', fontdict=title_font, loc='center', pad=20)

# Ascending totals, so the wedges follow the sorted order
cost_by_store = df.groupby('stores')['cost'].sum()
sum_sa = cost_by_store.sort_values()
sum_sa.plot(kind='pie', y='sum', autopct='%1.1f%%', explode=explode, startangle=0)
 
Colored by Color Scripter

cs

 
pie 그래프
 
 
 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import folium
from folium import plugins
from folium.plugins import MarkerCluster
import json
 
# Card spending CSV; the header row is replaced by the column names below.
df1 = pd.read_csv("jeju_card.csv", engine='python', encoding='euc-kr', header=0,
                  names=['date', 'city', 'city2', 'type', 'user', 'visitor', 'age', 'sex', 'cost', 'stores'])
# df2: Jeju administrative districts with latitude ('위도') and longitude ('경도')
df2 = pd.read_csv("jeju_lalo.csv", engine='python', encoding='euc-kr')
# df3: spending per district
df3 = pd.read_csv("jeju_cost.csv", engine='python', encoding='euc-kr')
df = pd.DataFrame(df1)

# Centre the initial view on the mean latitude/longitude, zoom level 9.
lat = df2['위도'].mean()
long = df2['경도'].mean()
m = folium.Map([lat, long], zoom_start=9)
# Total cost per city2; as_index=False keeps city2 as an ordinary column
# instead of making it the index.
a = df.groupby(['city2'], as_index=False)['cost'].sum()

#--------------------------- GeoJSON overlay ---------------------------
# District boundaries; the with-block closes the file on exit.
with open("Jeju_json3.geojson", mode='rt', encoding='utf-8') as f:
    jeju_json = json.load(f)
folium.GeoJson(jeju_json, name='jeju_haejoeng').add_to(m)
#------------------------------------------------------------------------

#--------------------------- Choropleth ---------------------------------
folium.Choropleth(geo_data=jeju_json,                    # boundary geometry
                  data=df3,                              # values to colour by
                  columns=('읍면동', '이용금액'),           # (key column, value column)
                  key_on='feature.properties.adm_nm',    # GeoJSON property matched against the key column
                  fill_color='YlGn',                     # yellow-green colour scale
                  fill_opacity=0.8,
                  line_opacity=0.5,
                  legend_name='지역별 매출액'              # caption of the colour legend
                  ).add_to(m)
#------------------------------------------------------------------------

# The marker cluster is added to the map but not used by the markers below;
# pass add_to(marker_cluster) instead of add_to(m) to cluster them.
marker_cluster = MarkerCluster().add_to(m)
for district in a['city2']:
    # Rows of df2 whose '읍면동' equals this city2 value; the first one is used.
    b = df2[df2['읍면동'] == district]
    # Positional columns 4, 5 and 2 of df2 -- presumably latitude, longitude
    # and the display name; TODO confirm against the CSV layout.
    sub_lat = b.iloc[0, 4]
    sub_long = b.iloc[0, 5]
    title = b.iloc[0, 2]

    folium.Marker([sub_lat, sub_long], tooltip=title).add_to(m)

# folium.LayerControl(collapsed=False).add_to(m) would add a layer toggle.
m.save('jeju.folium.html')
 
 
Colored by Color Scripter

cs

제주도 marker

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53

import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import folium
from folium import plugins
from folium.plugins import MarkerCluster
import json
 
# NOTE(review): the original note here mentions splitting districts per city
# via a 'sggnm' field of the JSON; nothing below reads that field -- confirm.
df1 = pd.read_csv(r"C:\Users\rhdud\Desktop\py_team\read_file\123123.csv", engine='python', encoding='euc-kr', header=0,
                  names=['date', 'city', 'city2', 'type', 'user', 'visitor', 'age', 'sex', 'cost', 'stores'])
# df2: Jeju administrative districts with latitude ('위도') and longitude ('경도')
df2 = pd.read_csv(r"C:\Users\rhdud\Desktop\py_team\read_file\jeju_lalo.csv", engine='python', encoding='euc-kr')
df3 = pd.read_csv(r"C:\Users\rhdud\Desktop\py_team\read_file\jeju_cost.csv", engine='python', encoding='euc-kr')
df = pd.DataFrame(df1)

# Centre the initial view on the mean latitude/longitude, zoom level 9.
lat = df2['위도'].mean()
long = df2['경도'].mean()
m = folium.Map([lat, long], zoom_start=9)
# Total cost per (city2, sex, stores); the keys stay ordinary columns.
a = df.groupby(['city2', 'sex', 'stores'], as_index=False)['cost'].sum()

# Disabled experiment (previously kept in a bare string literal): a
# folium.FeatureGroup named '도시별 표현' with plugins.FeatureGroupSubGroup
# layers '제주시', '서귀포시' and 'Json으로 표현', each added via m.add_child.

#--------------------------- GeoJSON overlay ---------------------------
# District boundaries; the with-block closes the file on exit.
with open(r"C:\Users\rhdud\Desktop\py_team\read_file\Jeju_json.geojson", mode='rt', encoding='utf-8') as f:
    jeju_json = json.load(f)
folium.GeoJson(jeju_json, name='jeju_haejoeng').add_to(m)
#------------------------------------------------------------------------

#--------------------------- Choropleth ---------------------------------
# The property path is passed as-is: it contains no space, so slicing it from
# the last space onward (as the earlier version did) returned the same string.
key_by = 'feature.properties.adm_nm'
folium.Choropleth(geo_data=jeju_json,
                  data=df3,
                  columns=['읍면동', '이용금액'],   # [key column, value column]
                  key_on=key_by,
                  fill_color='YlGn',
                  fill_opacity=0.8
                  ).add_to(m)
#------------------------------------------------------------------------

m.save('jeju_data1.html')
Colored by Color Scripter

cs

제주도 choropleth

'도전하자. 프로젝트' 카테고리의 다른 글

2-1 파이썬 팀프로젝트 : 네이버 실시간검색 대체 이슈 예측 기획 (0)	2021.07.16
1-5. 파이썬 EDA 데이터 분석 팀 프로젝트 간단 정리 (2)	2021.06.26
1-3. 파이썬 EDA 데이터분석 팀 프로젝트 데이터 수집, 정제 등 (0)	2021.04.26
1-2. 파이썬 EDA 데이터분석 팀 프로젝트 웹스크래핑 (Beautifulsoup&Requests) (0)	2021.04.16
1-1. 파이썬 EDA 탐색적 데이터분석 프로젝트, 마케팅 관점에서 생각하기 (0)	2021.04.15

PREV 이전 1 NEXT 다음

✔굿모닝 IT ✔

matplotlib

2-4 파이썬 팀프로젝트 : matplotlib, smtplib 메일 자동화

'도전하자. 프로젝트' 카테고리의 다른 글

1-5. 파이썬 EDA 데이터 분석 팀 프로젝트 간단 정리

'도전하자. 프로젝트' 카테고리의 다른 글

1-4. 파이썬 EDA 데이터 분석 팀 프로젝트 판다스, 시각화 (Matplotlib, Json)

'도전하자. 프로젝트' 카테고리의 다른 글

+ Recent posts

티스토리툴바