Pandas

import pandas as pd   # load the pandas package
import numpy as np    # load the numpy package

pandas로 csv파일 불러오기

# Read the CSV (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv('bank_customer.csv')

df  # as the last expression of a notebook cell this displays the DataFrame; pandas truncates long frames to the first/last rows

불러온 csv파일 확인

df.info()     # prints a summary: index range, per-column non-null count and dtype, memory usage
# df.index    # index details; `index` is an attribute, not a method, so `df.index()` raises TypeError for any index type
# df.columns  # column labels
# df.values   # the underlying data as a NumPy array

9개의 column(index 제외), column별 non-null 값 개수(null 여부), dtype 확인

df.head(10)  # show the first 10 rows
# df.tail(10)  # show the last 10 rows

	cid	age	job	marital	education	default	balance	housing	loan
0	C00004	47	blue-collar	married	NaN	no	1506	yes	no
1	C00005	33	NaN	single	NaN	no	1	no	no
2	C00009	58	retired	married	1.0	no	121	yes	no
3	C00021	28	blue-collar	married	2.0	no	723	yes	yes
4	C00025	40	retired	married	1.0	no	0	yes	yes
5	C00026	44	admin.	married	2.0	no	-372	yes	no
6	C00027	39	management	single	3.0	no	255	yes	no
7	C00029	46	management	single	2.0	no	-246	yes	no
8	C00031	57	technician	married	2.0	no	839	no	yes
9	C00035	51	management	married	3.0	no	10635	yes	no

df.T # transpose: the columns become the index and the index becomes the columns

df.sort_index(ascending=True) # returns a copy sorted by index, ascending; pass ascending=False for descending (df itself is not modified)

데이터 오름차순,내림차순 정렬 (index 기준)

df.sort_values(by='age',ascending=False)  # returns a copy sorted by the 'age' column, largest first (df itself is not modified)

데이터 값 기준으로 정렬(특정 column 선택)

# df['age']        # a single column label returns a Series
df[['age','cid']]  # a list of labels (double brackets) returns a DataFrame with those columns, in the given order

# df.loc[[0,10]]   # note: a single list passed to .loc selects ROWS labelled 0 and 10, not columns

column 선택하기

df[0:10] # positional row slice, end-exclusive: rows 0-9. Same as df.iloc[0:10]; NOT the same as df.loc[0:10], which is label-based and includes label 10 (11 rows with the default RangeIndex)

	cid	age	job	marital	education	default	balance	housing	loan
0	C00004	47	blue-collar	married	NaN	no	1506	yes	no
1	C00005	33	NaN	single	NaN	no	1	no	no
2	C00009	58	retired	married	1.0	no	121	yes	no
3	C00021	28	blue-collar	married	2.0	no	723	yes	yes
4	C00025	40	retired	married	1.0	no	0	yes	yes
5	C00026	44	admin.	married	2.0	no	-372	yes	no
6	C00027	39	management	single	3.0	no	255	yes	no
7	C00029	46	management	single	2.0	no	-246	yes	no
8	C00031	57	technician	married	2.0	no	839	no	yes
9	C00035	51	management	married	3.0	no	10635	yes	no

슬라이스

# df['20130101':'20150202']  # with a DatetimeIndex, slice rows using date strings (both endpoints included); bare integers such as 20130101 are not interpreted as dates

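RangeIndex뿐만 아니라 날짜 인덱스(DatetimeIndex) 등 다양한 형식의 index로도 슬라이스 가능
index가 2013-05-02 같은 날짜인 경우 '20130502' 또는 '2013-05-02' 같은 날짜 문자열로 슬라이스해야 하며, pandas가 이 문자열을 날짜로 해석함 (정수 20130502는 날짜로 인식되지 않음)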

dataframe.loc[행,열]

loc[] 에 값을 하나만 넣는 경우 row 선택

# df.loc[3]        # the row whose index label is 3 (returned as a Series)
# df.loc[0:3]      # label slice 0 through 3; .loc includes the end label, so this is 4 rows (0, 1, 2, 3)
# df.loc[[0,3]]    # only the two rows labelled 0 and 3

# df.age > 300     # boolean Series: True for rows where age exceeds 300
df.loc[ df.age > 300 ]   # boolean-mask selection: keeps only the rows where the mask is True

	cid	age	job	marital	education	default	balance	housing	loan
7788	C25900	380	management	single	3.0	no	1998	no	no

KiHyuk