pandas

대분류

라이브러리

소분류

Python Module

유형

데이터

조작

정제

분석

시각화

주요 레퍼런스

https://velog.io/@euisuk-chung/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%9C%EA%B0%81%ED%99%94-%EB%A7%88%EC%8A%A4%ED%84%B0%ED%95%98%EA%B8%B0-Pandas

모듈 분류

외장

최종 편집 일시

2024/10/27 15:31

생성 일시

2024/07/19 06:15

13 more properties

설명

파이썬 데이터 분석 라이브러리로, 데이터 조작, 정제, 분석, 시각화 등을 위한 다양한 기능을 제공

설치

> python -m pip install --upgrade pip
> pip install pandas

import numpy as np #짝꿍
import pandas as pd 
Python
복사

데이터 구조

pd.Series() → Series객체

•

1차원 구조로 되어 있는 데이터

pd.Series(data=None, index=None, dtype=None, name=None, copy=False)
Python
복사

Series 생성

data = {'a':1, 'b':2, 'c':3} # 딕셔너리
pd.Series(data=data, dtype=np.int16, name='dict')
     
# a    1
# b    2
# c    3
# Name: dict, dtype: int16

# 스칼라 값인 경우 인덱스를 제공해야 함
pd.Series(5.0, index=['a', 'b', 'c', 'd', 'e'])
     
# a    5.0
# b    5.0
# c    5.0
# d    5.0
# e    5.0
# dtype: float64

# np.random.randn: 표준 정규분포(가우시안 분포)를 따르는 난수 생성
s = pd.Series(data=np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s
     
# a   -1.136282
# b   -0.274201
# c    1.846150
# d   -0.157367
# e   -0.433985
# dtype: float64
Python
복사

슬라이싱/인덱스

•

NumPy와 같이 슬라이싱/인덱싱 같은 작업 가능

s[0]
# 1.3442340044624743

s[:3]
# a    1.344234
# b   -0.388342
# c    0.602582
# dtype: float64

s[s>s.median()], s.median()
# (a    1.344234
#  d    2.037618
# dtype: float64, 0.6025819149869053)

'a' in s
# True

't' in s
# False

s.get('rr')
# None (해당 인덱스가 없으면 기본값 None 반환)

s.get('a', 'None')
# -1.1362818047746768


# a   -1.136282
# b   -0.274201
# c    1.846150
# d   -0.157367
# e   -0.433985
# dtype: float64

s[0]
# -1.1362818047746768

s[[4,0,1]], a[[4,0,1]], c[[4,2,1]]
# (e   -0.433985
#  a   -1.136282
#  b   -0.274201
#  dtype: float64, e   -0.433985
#  a   -1.136282
#  b   -0.274201
#  dtype: float64, e   -0.433985
#  c    1.846150
#  b   -0.274201
#  dtype: float64)

lst = [4,2,1]  
########3
# 작업
####
a = s 
c = s
s[lst], a[lst], c[lst]
# (e   -0.433985
#  c    1.846150
#  b   -0.274201
#  dtype: float64, e   -0.433985
#  c    1.846150
#  b   -0.274201
#  dtype: float64, e   -0.433985
#  c    1.846150
#  b   -0.274201
#  dtype: float64)

s['a']
# -1.1362818047746768

s['a'] = 1.5
s
# a    1.500000
# b   -0.274201
# c    1.846150
# d   -0.157367
# e   -0.433985
# dtype: float64

s[[0,1,2]] = [0,1,2]
s
# a    0.000000
# b    1.000000
# c    2.000000
# d   -0.157367
# e   -0.433985
# dtype: float64
Python
복사

abs(), exp()

np.abs(s) # 절대값 계산
# a    0.000000
# b    1.000000
# c    2.000000
# d    0.157367
# e    0.433985
# dtype: float64

np.exp(s) # 각 원소 x에 대해 e^x (자연상수 e의 지수함수) 계산
# a    1.000000
# b    2.718282
# c    7.389056
# d    0.854391
# e    0.647922
# dtype: float64
Python
복사

numpy로 변환

s.to_numpy()
     
# array([ 1.344234  , -0.38834187,  0.60258191,  2.03761822, -0.5560486 ])
Python
복사

pd.DataFrame() → DataFrame 객체

•

행과 열로 이루어진 2차원 데이터를 다루기 위한 객체

•

열은 각각의 변수를 나타내고, 행은 각각의 관측치를 나타냄

DataFrame 생성

data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}

df = pd.DataFrame(data=data)
df
Python
복사

리스트 → DataFrame

data = [['A', 1], ['B', 2], ['C', 3]]
df = pd.DataFrame(data, columns=['col1', 'col2'])
print(df)
# 출력 결과
#   col1  col2
# 0    A     1
# 1    B     2
# 2    C     3
Python
복사

딕셔너리 변환 → DataFrame

data = {'col1': ['A', 'B', 'C'], 'col2': [1, 2, 3]}
df = pd.DataFrame(data)
print(df)
# 출력 결과
#   col1  col2
# 0    A     1
# 1    B     2
# 2    C     3
Python
복사

데이터 변환 DataFrame → 다른 형식 (parquet, csv, excel, dict, json)

pd.DataFrame.to_parquet()

pd.DataFrame.to_csv()

pd.DataFrame.to_excel()

pd.DataFrame.to_dict()

df.to_dict()
# {'A': {'row_1': 3, 'row_2': 'a'},
#  'B': {'row_1': 2, 'row_2': 'b'},
#  'C': {'row_1': 1, 'row_2': 'c'},
#  'D': {'row_1': 0, 'row_2': 'd'}}

df.to_dict('series') # Series로 변환
# {'A': row_1    3
#  row_2    a
#  Name: A, dtype: object, 'B': row_1    2
#  row_2    b
#  Name: B, dtype: object, 'C': row_1    1
#  row_2    c
#  Name: C, dtype: object, 'D': row_1    0
#  row_2    d
#  Name: D, dtype: object}

df.to_dict('records') # > json형태
# [{'A': 3, 'B': 2, 'C': 1, 'D': 0}, {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}]
Python
복사

pd.DataFrame.to_json()

df.to_json()
# '{"A":{"row_1":3,"row_2":"a"},"B":{"row_1":2,"row_2":"b"},"C":{"row_1":1,"row_2":"c"},"D":{"row_1":0,"row_2":"d"}}'

df.to_json(orient="records")
# df.to_dict('records')와 같은 구조 (단, 결과는 JSON 문자열)
# '[{"A":3,"B":2,"C":1,"D":0},{"A":"a","B":"b","C":"c","D":"d"}]'
Python
복사

데이터 조작

•

예시 데이터

data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
df = pd.DataFrame(data=data)
df

# one	two
# a	1.0	1.0
# b	2.0	2.0
# c	3.0	3.0
# d	NaN	4.0
Python
복사

컬럼 선택

•

one 열의 데이터를 조회

df['one']
     
# a    1.0
# b    2.0
# c    3.0
# d    NaN
# Name: one, dtype: float64
Python
복사

컬럼 조건 선택

•

one 열에서 2보다 큰 데이터의 boolean값 조회

df['one'] > 2
     
# a    False
# b    False
# c     True
# d    False
# Name: one, dtype: bool
Python
복사

•

리스트형태로 만들기

list(df['one'] > 2)
     
# [False, False, True, False]
Python
복사

컬럼 추가

•

three 열을 추가하는데, 값은 one 열과 two 열의 데이터를 더한 결과

df['three'] = df['one']+df['two']
df

# one	two	three
# a	1.0	1.0	2.0
# b	2.0	2.0	4.0
# c	3.0	3.0	6.0
# d	NaN	4.0	NaN
Python
복사

컬럼 조건 추가

•

flag 열을 추가하는데, 값은 조건(one > 2)을 만족하는지 나타내는 boolean값

df['flag'] = df['one'] > 2
df
     
# one	two	three	flag
# a	1.0	1.0	2.0	False
# b	2.0	2.0	4.0	False
# c	3.0	3.0	6.0	True
# d	NaN	4.0	NaN	False
# 열을 삭제할 때는 del / pop()을 사용할 수 있다.
Python
복사

컬럼 삭제

del : 부수효과 O

del df['two']
df
     
# one	three	flag
# a	1.0	2.0	False
# b	2.0	4.0	False
# c	3.0	6.0	True
# d	NaN	NaN	False
Python
복사

pop() : 부수효과 O

three = df.pop("three")
three 
     
# a    2.0
# b    4.0
# c    6.0
# d    NaN
# Name: three, dtype: float64
Python
복사

부수효과 확인

df
     
# one	flag
# a	1.0	False
# b	2.0	False
# c	3.0	True
# d	NaN	False
Python
복사

컬럼 추가

df['foo'] = 'bar'
df
# one	flag	foo
# a	1.0	False	bar
# b	2.0	False	bar
# c	3.0	True	bar
# d	NaN	False	bar

df['one_trunc'] = df['one'][:2]
df
# one	flag	foo	one_trunc
# a	1.0	False	bar	1.0
# b	2.0	False	bar	2.0
# c	3.0	True	bar	NaN
# d	NaN	False	bar	NaN

# 특정 위치의 열 추가
df.insert(1, 'bar', df['one'])
df 
# one	bar	flag	foo	one_trunc
# a	1.0	1.0	False	bar	1.0
# b	2.0	2.0	False	bar	2.0
# c	3.0	3.0	True	bar	NaN
# d	NaN	NaN	False	bar	NaN

df['bar'] = df['one']
df
# one	two	three	bar
# a	1.0	1.0	2.0	1.0
# b	2.0	2.0	4.0	2.0
# c	3.0	3.0	6.0	3.0
# d	NaN	4.0	NaN	NaN

df.columns = ['one', 'bar', 'two', 'three']
df
# one	bar	two	three
# a	1.0	1.0	2.0	1.0
# b	2.0	2.0	4.0	2.0
# c	3.0	3.0	6.0	3.0
# d	NaN	4.0	NaN	NaN

df.index, df.columns
# (Index(['a', 'b', 'c', 'd'], dtype='object'),
#  Index(['one', 'bar', 'two', 'three'], dtype='object'))
Python
복사

열이나 행을 선택해 데이터 조회

•

열 선택하기

df['col1']
df[['col1', 'col2']]
Python
복사

•

행 선택하기

df.loc[0]
df.loc[[0, 1, 2]]
Python
복사