파이썬 판다스(Pandas) 데이터프레임 데이터 추출

11 minute read

데이터프레임 데이터 추출

import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/jin0choi1216/dataset/main/AAPL.csv", index_col = 0)
df.head(5)
Open High Low Close Adj Close Volume
Date
2012-01-03 14.621429 14.732143 14.607143 14.686786 12.466090 302220800
2012-01-04 14.642857 14.810000 14.617143 14.765714 12.533089 260022000
2012-01-05 14.819643 14.948214 14.738214 14.929643 12.672229 271269600
2012-01-06 14.991786 15.098214 14.972143 15.085714 12.804703 318292800
2012-01-09 15.196429 15.276786 15.048214 15.061786 12.784389 394024400

1. 데이터 추출하기

1) 비교 연산

df[(df["Close"] >= 150) & (df["Open"] >= 160) ]
df.query("(Close >= 150) and (Open >= 160)") # 같은 쿼리
Open High Low Close Adj Close Volume
Date
2021-11-22 161.679993 165.699997 161.000000 161.020004 159.410767 117467900
2021-11-23 161.119995 161.800003 159.059998 161.410004 159.796844 96041900
2021-11-24 160.750000 162.139999 159.639999 161.940002 160.321533 69463600
2021-12-01 167.479996 170.300003 164.529999 164.770004 163.123276 152052500
2021-12-03 164.020004 164.960007 159.720001 161.839996 160.222549 118023100
... ... ... ... ... ... ...
2023-08-21 175.070007 176.130005 173.740005 175.839996 175.839996 46311900
2023-08-22 177.059998 177.679993 176.250000 177.229996 177.229996 42084200
2023-08-23 178.520004 181.550003 178.330002 181.119995 181.119995 52722800
2023-08-24 180.669998 181.100006 176.009995 176.380005 176.380005 54945800
2023-08-25 177.380005 179.149994 175.820007 178.610001 178.610001 51418700

224 rows × 6 columns

2) 질의

 # 문자열 query
df.query("Volume > 100000000")
Open High Low Close Adj Close Volume
Date
2012-01-03 14.621429 14.732143 14.607143 14.686786 12.466090 302220800
2012-01-04 14.642857 14.810000 14.617143 14.765714 12.533089 260022000
2012-01-05 14.819643 14.948214 14.738214 14.929643 12.672229 271269600
2012-01-06 14.991786 15.098214 14.972143 15.085714 12.804703 318292800
2012-01-09 15.196429 15.276786 15.048214 15.061786 12.784389 394024400
... ... ... ... ... ... ...
2023-02-03 148.029999 157.380005 147.830002 154.500000 153.843628 154357300
2023-05-05 170.979996 174.300003 170.759995 173.570007 173.096512 113316400
2023-06-05 182.630005 184.949997 178.039993 179.580002 179.337830 121946500
2023-06-16 186.729996 186.990005 184.270004 184.919998 184.670624 101235600
2023-08-04 185.520004 187.380005 181.919998 181.990005 181.744583 115799700

1997 rows × 6 columns

# 인덱스 컬럼 기준 query
df.query("Date > '2020-01-01'")
Open High Low Close Adj Close Volume
Date
2020-01-02 74.059998 75.150002 73.797501 75.087502 73.249023 135480400
2020-01-03 74.287498 75.144997 74.125000 74.357498 72.536888 146322800
2020-01-06 73.447502 74.989998 73.187500 74.949997 73.114883 118387200
2020-01-07 74.959999 75.224998 74.370003 74.597504 72.771019 108872000
2020-01-08 74.290001 76.110001 74.290001 75.797501 73.941643 132079200
... ... ... ... ... ... ...
2023-08-21 175.070007 176.130005 173.740005 175.839996 175.839996 46311900
2023-08-22 177.059998 177.679993 176.250000 177.229996 177.229996 42084200
2023-08-23 178.520004 181.550003 178.330002 181.119995 181.119995 52722800
2023-08-24 180.669998 181.100006 176.009995 176.380005 176.380005 54945800
2023-08-25 177.380005 179.149994 175.820007 178.610001 178.610001 51418700

919 rows × 6 columns

# 컬럼 값 비교 쿼리
df.query("Open > Close")
Open High Low Close Adj Close Volume
Date
2012-01-09 15.196429 15.276786 15.048214 15.061786 12.784389 394024400
2012-01-10 15.211071 15.214286 15.053571 15.115714 12.830169 258196400
2012-01-11 15.095714 15.101786 14.975357 15.091071 12.809250 215084800
2012-01-12 15.081429 15.103571 14.955357 15.049643 12.774082 212587200
2012-01-19 15.362500 15.406071 15.232500 15.276786 12.966883 261738400
... ... ... ... ... ... ...
2023-08-10 179.479996 180.750000 177.600006 177.970001 177.729996 54686900
2023-08-15 178.880005 179.479996 177.050003 177.449997 177.449997 43622600
2023-08-16 177.130005 178.539993 176.500000 176.570007 176.570007 46964900
2023-08-17 177.139999 177.509995 173.479996 174.000000 174.000000 66062900
2023-08-24 180.669998 181.100006 176.009995 176.380005 176.380005 54945800

1388 rows × 6 columns

# isin 값에 해당하는 인덱스 목록 반환
isin = df["Volume"].isin([394024400, 258196400])
df.query("Volume in [394024400, 258196400]") # 동일한 쿼리
print(f"isin :\n{isin}")
df[isin]
isin :
Date
2012-01-03    False
2012-01-04    False
2012-01-05    False
2012-01-06    False
2012-01-09     True
              ...  
2023-08-21    False
2023-08-22    False
2023-08-23    False
2023-08-24    False
2023-08-25    False
Name: Volume, Length: 2931, dtype: bool
Open High Low Close Adj Close Volume
Date
2012-01-09 15.196429 15.276786 15.048214 15.061786 12.784389 394024400
2012-01-10 15.211071 15.214286 15.053571 15.115714 12.830169 258196400
# 일치하는 시리즈 반환
df.query("Volume == 1000")
Open High Low Close Adj Close Volume
Date
# 변수 치환
value = 1000
df.query("Volume == @value")
df.query(f"Volume == {value}")
Open High Low Close Adj Close Volume
Date

2. 인덱스 컬럼 변환

df.reset_index()
Date Open High Low Close Adj Close Volume
0 2012-01-03 14.621429 14.732143 14.607143 14.686786 12.466090 302220800
1 2012-01-04 14.642857 14.810000 14.617143 14.765714 12.533089 260022000
2 2012-01-05 14.819643 14.948214 14.738214 14.929643 12.672229 271269600
3 2012-01-06 14.991786 15.098214 14.972143 15.085714 12.804703 318292800
4 2012-01-09 15.196429 15.276786 15.048214 15.061786 12.784389 394024400
... ... ... ... ... ... ... ...
2926 2023-08-21 175.070007 176.130005 173.740005 175.839996 175.839996 46311900
2927 2023-08-22 177.059998 177.679993 176.250000 177.229996 177.229996 42084200
2928 2023-08-23 178.520004 181.550003 178.330002 181.119995 181.119995 52722800
2929 2023-08-24 180.669998 181.100006 176.009995 176.380005 176.380005 54945800
2930 2023-08-25 177.380005 179.149994 175.820007 178.610001 178.610001 51418700

2931 rows × 7 columns

3. Apply

df_kospi = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/KOSPI_stocks.csv', index_col = 0)
df_kospi.head()
Code ISU_CD Name Market Dept Close ChangeCode Changes ChagesRatio Open High Low Volume Amount Marcap Stocks MarketId
0 005930 KR7005930003 삼성전자 KOSPI NaN 67100 2 -1100 -1.61 67100 67400 66900 7032462 471934306900 400572409105000 5969782550 STK
1 373220 KR7373220003 LG에너지솔루션 KOSPI NaN 546000 1 11000 2.06 527000 549000 525000 249493 135513119000 127764000000000 234000000 STK
2 000660 KR7000660001 SK하이닉스 KOSPI NaN 116500 2 -4400 -3.64 117400 118300 115300 3533647 412064064200 84812275522500 728002365 STK
3 207940 KR7207940008 삼성바이오로직스 KOSPI NaN 757000 2 -6000 -0.79 755000 764000 754000 23435 17763884000 53878718000000 71174000 STK
4 005490 KR7005490008 POSCO홀딩스 KOSPI NaN 577000 2 -2000 -0.35 567000 583000 561000 904736 519579225000 48797599710000 84571230 STK
# 함수를 매개변수로 받아 호출한다
def checkChangesRatio(val: int) -> str:
    if val > 0:
        return "오른 주식"
    else:
        return "내린 주식"

df_kospi.ChagesRatio.apply(checkChangesRatio)
0      내린 주식
1      오른 주식
2      내린 주식
3      내린 주식
4      내린 주식
       ...  
946    오른 주식
947    오른 주식
948    내린 주식
949    내린 주식
950    내린 주식
Name: ChagesRatio, Length: 951, dtype: object
# 람다 표현식
df_kospi.ChagesRatio.apply(lambda val: "오른 주식" if val > 0 else "내린 주식")
0      내린 주식
1      오른 주식
2      내린 주식
3      내린 주식
4      내린 주식
       ...  
946    오른 주식
947    오른 주식
948    내린 주식
949    내린 주식
950    내린 주식
Name: ChagesRatio, Length: 951, dtype: object

4. With Numpy

import numpy as np
df_kospi["ADD"] = np.where(df_kospi.ChagesRatio > 0, "오른 주식", "내린 주식")
df_kospi
Code ISU_CD Name Market Dept Close ChangeCode Changes ChagesRatio Open High Low Volume Amount Marcap Stocks MarketId TT ADD
0 005930 KR7005930003 삼성전자 KOSPI NaN 67100 2 -1100 -1.61 67100 67400 66900 7032462 471934306900 400572409105000 5969782550 STK 내린 주식 내린 주식
1 373220 KR7373220003 LG에너지솔루션 KOSPI NaN 546000 1 11000 2.06 527000 549000 525000 249493 135513119000 127764000000000 234000000 STK 오른 주식 오른 주식
2 000660 KR7000660001 SK하이닉스 KOSPI NaN 116500 2 -4400 -3.64 117400 118300 115300 3533647 412064064200 84812275522500 728002365 STK 내린 주식 내린 주식
3 207940 KR7207940008 삼성바이오로직스 KOSPI NaN 757000 2 -6000 -0.79 755000 764000 754000 23435 17763884000 53878718000000 71174000 STK 내린 주식 내린 주식
4 005490 KR7005490008 POSCO홀딩스 KOSPI NaN 577000 2 -2000 -0.35 567000 583000 561000 904736 519579225000 48797599710000 84571230 STK 내린 주식 내린 주식
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
946 014915 KR7014911002 성문전자우 KOSPI NaN 5830 1 50 0.87 5780 5830 5780 123 711790 3498000000 600000 STK 오른 주식 오른 주식
947 002787 KR7002782001 진흥기업2우B KOSPI NaN 11170 1 150 1.36 11020 11170 10910 148 1623110 3293005360 294808 STK 오른 주식 오른 주식
948 000227 KR7000222000 유유제약2우B KOSPI NaN 10080 2 -240 -2.33 10090 10090 10080 75 756060 3281644800 325560 STK 내린 주식 내린 주식
949 001525 KR7001521004 동양우 KOSPI NaN 5210 2 -10 -0.19 5220 5220 5210 37 192810 3218378510 617731 STK 내린 주식 내린 주식
950 001527 KR7001522002 동양2우B KOSPI NaN 10150 2 -10 -0.10 10040 10220 10040 53 534860 3131975350 308569 STK 내린 주식 내린 주식

951 rows × 19 columns

5. 값 집계

df_value_count = df_kospi.ADD.value_counts()
print(type(df_value_count))
df_value_count
<class 'pandas.core.series.Series'>





ADD
내린 주식    583
오른 주식    368
Name: count, dtype: int64

6. 순위

df_kospi.Marcap.rank(ascending = False)
0        1.0
1        2.0
2        3.0
3        4.0
4        5.0
       ...  
946    947.0
947    948.0
948    949.0
949    950.0
950    951.0
Name: Marcap, Length: 951, dtype: float64

7. Group by

df_krx = pd.read_csv("https://raw.githubusercontent.com/jin0choi1216/dataset/main/KRX_stocks.csv", index_col=0)
print(df_krx.head())
df_krx[["Market", "ChagesRatio"]].groupby("Market").mean()
     Code        ISU_CD      Name Market Dept   Close  ChangeCode  Changes  \
0  005930  KR7005930003      삼성전자  KOSPI  NaN   67100           2    -1100   
1  373220  KR7373220003  LG에너지솔루션  KOSPI  NaN  546000           1    11000   
2  000660  KR7000660001    SK하이닉스  KOSPI  NaN  116500           2    -4400   
3  207940  KR7207940008  삼성바이오로직스  KOSPI  NaN  757000           2    -6000   
4  005490  KR7005490008  POSCO홀딩스  KOSPI  NaN  577000           2    -2000   

   ChagesRatio    Open    High     Low   Volume        Amount  \
0        -1.61   67100   67400   66900  7032462  471934306900   
1         2.06  527000  549000  525000   249493  135513119000   
2        -3.64  117400  118300  115300  3533647  412064064200   
3        -0.79  755000  764000  754000    23435   17763884000   
4        -0.35  567000  583000  561000   904736  519579225000   

            Marcap      Stocks MarketId  
0  400572409105000  5969782550      STK  
1  127764000000000   234000000      STK  
2   84812275522500   728002365      STK  
3   53878718000000    71174000      STK  
4   48797599710000    84571230      STK  
ChagesRatio
Market
KONEX -0.168281
KOSDAQ -0.391093
KOSDAQ GLOBAL -0.813600
KOSPI 0.107560
df_krx[["Market", "ChagesRatio", "Open"]].groupby("Market").agg({"ChagesRatio": "mean", "Open": "sum"})
ChagesRatio Open
Market
KONEX -0.168281 624219
KOSDAQ -0.391093 17308486
KOSDAQ GLOBAL -0.813600 2906500
KOSPI 0.107560 34526986
df_krx[["Market", "ChagesRatio", "Open"]].groupby("Market").agg({"ChagesRatio": ["max", "sum"], "Open": ["min", "mean"]})
ChagesRatio Open
max sum min mean
Market
KONEX 14.89 -21.54 0 4876.710938
KOSDAQ 29.99 -633.18 0 10690.849907
KOSDAQ GLOBAL 13.70 -40.68 7170 58130.000000
KOSPI 30.00 102.29 0 36305.978970

8. Merge

market_price_change = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/market_price_change_2022.csv')
print(market_price_change.head(2))
print(market_price_change.shape)
purchases_of_equities = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/purchases_of_equities_2022.csv')
print(purchases_of_equities.head(2))
print(purchases_of_equities.shape)

# 컬럼 이름이 같을 경우

# inner
merged_inner = pd.merge(left = market_price_change, right = purchases_of_equities, how = "inner", on = ["티커", "종목명"])
print(f"inner shape : {merged_inner.shape}")

# left
merged_left = pd.merge(left = market_price_change, right = purchases_of_equities, how = "left", on = ["티커", "종목명"])
print(f"left shape : {merged_left.shape}")

# right
merged_right = pd.merge(left = market_price_change, right = purchases_of_equities, how = "right", on = ["티커", "종목명"])
print(f"right shape : {merged_right.shape}")

# outer
merged_outer = pd.merge(left = market_price_change, right = purchases_of_equities, how = "outer", on = ["티커", "종목명"])
print(f"outer shape : {merged_outer.shape}")
       티커     종목명     시가     종가   변동폭    등락률      거래량         거래대금
0  095570  AJ네트웍스   6110   5720  -390  -6.38  2947645  17162545200
1  006840   AK홀딩스  15150  17200  2050  13.53   455492   7706148650
(942, 8)
       티커     종목명    매도거래량    매수거래량  순매수거래량       매도거래대금       매수거래대금  순매수거래대금
0  060310      3S  3318765  3318765       0   7721430565   7721430565        0
1  095570  AJ네트웍스  2947645  2947645       0  17162545200  17162545200        0
(2625, 8)
inner shape : (916, 14)
left shape : (942, 14)
right shape : (2625, 14)
outer shape : (2651, 14)
# 컬럼 이름이 다를 경우
pur2 = purchases_of_equities.rename(columns = { "티커": "티커2", "종목명": "종목명2" })

diff_column = pd.merge(
    left = market_price_change,
    right = pur2,
    how = "inner",
    left_on = ["티커", "종목명"],
    right_on = ["티커2", "종목명2"],
    suffixes = ("_L", "_R")
)
diff_column
티커 종목명 시가 종가 변동폭 등락률 거래량 거래대금 티커2 종목명2 매도거래량 매수거래량 순매수거래량 매도거래대금 매수거래대금 순매수거래대금
0 095570 AJ네트웍스 6110 5720 -390 -6.38 2947645 17162545200 095570 AJ네트웍스 2947645 2947645 0 17162545200 17162545200 0
1 006840 AK홀딩스 15150 17200 2050 13.53 455492 7706148650 006840 AK홀딩스 455492 455492 0 7706148650 7706148650 0
2 027410 BGF 4365 4305 -60 -1.37 14388855 62685829240 027410 BGF 14388855 14388855 0 62685829240 62685829240 0
3 282330 BGF리테일 205500 210500 5000 2.43 712994 146603104000 282330 BGF리테일 712994 712994 0 146603104000 146603104000 0
4 138930 BNK금융지주 7390 6500 -890 -12.04 37130723 264104001940 138930 BNK금융지주 37130723 37130723 0 264104001940 264104001940 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
911 005010 휴스틸 5420 5070 -350 -6.46 9608972 49226534830 005010 휴스틸 9608972 9608972 0 49226534830 49226534830 0
912 000540 흥국화재 3165 3370 205 6.48 15972406 59888402133 000540 흥국화재 15972406 15972406 0 59888402133 59888402133 0
913 000547 흥국화재2우B 19300 16200 -3100 -16.06 18918 364549550 000547 흥국화재2우B 18918 18918 0 364549550 364549550 0
914 000545 흥국화재우 6020 6150 130 2.16 68957 432345040 000545 흥국화재우 68957 68957 0 432345040 432345040 0
915 003280 흥아해운 1625 1355 -270 -16.62 4552374 6915159460 003280 흥아해운 4552374 4552374 0 6915159460 6915159460 0

916 rows × 16 columns

9. Pivot Table

# 2012 ~ 2023
GOOGL = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/GOOGL.csv')
AAPL = pd.read_csv('https://raw.githubusercontent.com/jin0choi1216/dataset/main/AAPL.csv')

GOOGL["Symbol"] = "Google"
AAPL["Symbol"] = "Apple"

df = pd.concat([GOOGL, AAPL])
# 날짜를 기준으로 구글과 애플별로 피벗
pivot = pd.pivot_table(data = df, index = "Date", columns = "Symbol", values = "Close")
print(pivot)

# 인덱스 여러개 지정
pivots = pd.pivot_table(data = df, index = ["Date", "Symbol"], values = "Close", aggfunc = "mean")
print(pivots)
Symbol           Apple      Google
Date                              
2012-01-03   14.686786   16.651901
2012-01-04   14.765714   16.723724
2012-01-05   14.929643   16.491741
2012-01-06   15.085714   16.266768
2012-01-09   15.061786   15.577077
...                ...         ...
2023-08-21  175.839996  128.369995
2023-08-22  177.229996  129.080002
2023-08-23  181.119995  132.369995
2023-08-24  176.380005  129.779999
2023-08-25  178.610001  129.880005

[2931 rows x 2 columns]
                        Close
Date       Symbol            
2012-01-03 Apple    14.686786
           Google   16.651901
2012-01-04 Apple    14.765714
           Google   16.723724
2012-01-05 Apple    14.929643
...                       ...
2023-08-23 Google  132.369995
2023-08-24 Apple   176.380005
           Google  129.779999
2023-08-25 Apple   178.610001
           Google  129.880005

[5862 rows x 1 columns]

9. 이전 값 비교

# 이전 값 과의 차이
GOOGL.Low.diff()

# 이전 값 과의 차이 퍼센트 수치
GOOGL.Low.pct_change()
0            NaN
1       0.012646
2      -0.006645
3      -0.009814
4      -0.043953
          ...   
2926    0.001424
2927    0.013907
2928    0.012079
2929   -0.002310
2930   -0.017905
Name: Low, Length: 2931, dtype: float64

10. 결측치 수치 확인

is_null = merged_outer.isnull()
print(is_null.head(5))
print(is_null.sum())
      티커    종목명     시가     종가    변동폭    등락률    거래량   거래대금  매도거래량  매수거래량  \
0  False  False  False  False  False  False  False  False  False  False   
1  False  False  False  False  False  False  False  False  False  False   
2  False  False  False  False  False  False  False  False  False  False   
3  False  False  False  False  False  False  False  False  False  False   
4  False  False  False  False  False  False  False  False  False  False   

   순매수거래량  매도거래대금  매수거래대금  순매수거래대금  
0   False   False   False    False  
1   False   False   False    False  
2   False   False   False    False  
3   False   False   False    False  
4   False   False   False    False  
티커            0
종목명           0
시가         1709
종가         1709
변동폭        1709
등락률        1709
거래량        1709
거래대금       1709
매도거래량        26
매수거래량        26
순매수거래량       26
매도거래대금       26
매수거래대금       26
순매수거래대금      26
dtype: int64

11. 데이터 밀기

GOOGL["Low_2"] = GOOGL.Low.shift(1, fill_value = 9999) 
GOOGL
Date Open High Low Close Adj Close Volume Symbol Low_2
0 2012-01-03 16.339840 16.720470 16.325577 16.651901 16.651901 146912940 Google 9999.000000
1 2012-01-04 16.642391 16.773024 16.532032 16.723724 16.723724 114445440 Google 16.325577
2 2012-01-05 16.569820 16.615866 16.422173 16.491741 16.491741 131184684 Google 16.532032
3 2012-01-06 16.495245 16.516518 16.261011 16.266768 16.266768 107608284 Google 16.422173
4 2012-01-09 16.178679 16.191191 15.546296 15.577077 15.577077 232671096 Google 16.261011
... ... ... ... ... ... ... ... ... ...
2926 2023-08-21 127.180000 128.729996 126.559998 128.369995 128.369995 25248700 Google 126.379997
2927 2023-08-22 128.509995 130.279999 128.320007 129.080002 129.080002 22067500 Google 126.559998
2928 2023-08-23 130.179993 133.410004 129.869995 132.369995 132.369995 27819700 Google 128.320007
2929 2023-08-24 133.949997 134.250000 129.570007 129.779999 129.779999 28500700 Google 129.869995
2930 2023-08-25 129.539993 130.759995 127.250000 129.880005 129.880005 26744800 Google 129.570007

2931 rows × 9 columns

12. 이동 평균, 누적 평균

expanding = GOOGL.Low.expanding() # 누적 평균 
print(expanding.mean())
rolling = GOOGL.Low.rolling(window = 5) # 이동평균
print(rolling.mean())
0       16.325577
1       16.428804
2       16.426594
3       16.385198
4       16.217418
          ...    
2926    57.929351
2927    57.953392
2928    57.977945
2929    58.002379
2930    58.026005
Name: Low, Length: 2931, dtype: float64
0              NaN
1              NaN
2              NaN
3              NaN
4        16.217418
           ...    
2926    127.875998
2927    127.684000
2928    128.083998
2929    128.140001
2930    128.314001
Name: Low, Length: 2931, dtype: float64

Comments