컴퓨터공학/인공지능|데이터사이언스
[Data Science] 1. Python Basics For Data Science(NumPy, Pandas)
유(YOO)
2023. 8. 14. 16:52
Link
https://app.datascientist.fr/learn/learning/57/58/168/761
라이브러리 종류
- NumPy : List와 유사한 다차원 배열(ndarray)
- Pandas : Dictionary와 유사한 레이블 기반 자료구조(Series, DataFrame)
NumPy
- Dimension : 행렬 차원
- 행렬 생성
import numpy as np
# Arrays of increasing rank, each built from a (nested) Python literal.
arr1 = np.array(862)  # rank 0: a lone scalar
arr2 = np.array([1, 2, 3, 4, 5])  # rank 1: five elements
arr3 = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
    ]
)  # rank 2: two rows of three
arr4 = np.array([arr3, arr3])  # rank 3: the 2x3 table above, repeated twice
arr5 = np.arange(5, dtype=int).astype(float)  # integer range converted to float
- 행렬 정보
# Build a 3x5 grid holding 0..14, then print its contents and metadata,
# one item per line.
a = np.arange(15).reshape((3, 5))
for report_item in (
    a,  # [[ 0 1 2 3 4] [ 5 6 7 8 9] [10 11 12 13 14]]
    a.shape,  # (3, 5)
    a.ndim,  # number of dimensions: 2
    a.dtype.name,  # element type name, e.g. int64
    a.itemsize,  # bytes per element, e.g. 8
    a.size,  # total number of elements: 15
    a.flat,  # flat iterator over the elements, usable in a for loop
    type(a),  # <class 'numpy.ndarray'>
):
    print(report_item)
- 특정 행렬 생성
# Constructors that produce arrays with prescribed contents.
print(np.zeros((3, 4)))  # 3x4 array of zeros
print(np.ones((2, 3, 4), dtype=np.int16))  # 2x3x4 array of int16 ones
print(np.empty((2, 3)))  # 2x3 array left uninitialized; values are arbitrary
print(np.arange(10, 30, 5))  # start 10, stop before 30, step 5
from numpy import pi
# 100 evenly spaced samples from 0 to 2*pi, passed through sin.
# Fixed: the closing parenthesis of print(...) was missing (SyntaxError).
print(np.sin(np.linspace(0, 2 * pi, 100)))
- 행렬 연산
# Two 2x2 operands used to contrast elementwise and matrix arithmetic.
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
print(A * B)  # elementwise product
print(A @ B)  # matrix product
print(np.add(A, B))  # elementwise sum
# Fixed: np.dot needs both operands; np.dot(A) raised TypeError.
print(np.dot(A, B))  # matrix product again, via the function form
print(np.exp(A))  # elementwise e**x
print(np.sqrt(A))  # elementwise square root
- 인덱스 접근
# Negative-step slice: rows come back in reverse order.
flipped_rows = A[::-1]
print(flipped_rows)
# Ellipsis stands in for every remaining axis:
# x[1, 2, ...] is equivalent to x[1, 2, :, :, :].
second_row = A[1, ...]
print(second_row)
# Index arrays of the entries equal to 5, wrapped in a list before printing.
matches_of_five = np.where(A == 5)
print([matches_of_five])
- 행렬 차원 변경
import numpy as np
a = np.arange(5)
# A -1 in the target shape asks NumPy to infer that axis length from the
# total element count; the three variants put the 5 elements on a different axis.
b = a.reshape(1, 1, -1)
c = a.reshape(1, -1, 1)
d = a.reshape(-1, 1, 1)
# 27 consecutive integers arranged as a 3x3x3 cube.
e = list(range(27))
f = np.asarray(e)
g = f.reshape(3, 3, 3)
- 타입 변경
import numpy as np
a = np.arange(5)
# astype hands back a new array converted to the requested element type.
o = a.astype(int)
- Display Table
import sys
# Set the print threshold to the largest native integer so that arrays are
# printed element by element rather than in abbreviated form.
np.set_printoptions(threshold=sys.maxsize) # Force to display the entire table
Pandas
- Series : 1차원
import pandas as pd
# Four ways to build a Series; each assignment replaces the previous x.
x = pd.Series(data=[6, 3, 4, 6])  # from a list, default integer labels
labels = ["a", "b", "c", "d", "e"]
x = pd.Series(np.arange(0, 13, 3), index=labels)  # custom labels for 0, 3, ..., 12
x = pd.Series({"a": 0.0, "b": 1.0})  # from a dict: keys become the labels
x = pd.Series(np.random.randn(5), index=labels, name="somthing")  # named Series of random values
- Date Series
# Daily timestamps from 5 Jan 2021 through 1 Dec 2021, both ends included.
# Fixed: the original "05-01-2021" / "01-12-2021" strings are parsed
# month-first, which put the start (May 1) after the end (Jan 12) and
# produced an empty index. ISO dates remove the ambiguity.
# NOTE(review): day-first dates are assumed to be the intent — confirm.
date_series = pd.date_range(start="2021-01-05", end="2021-12-01")
- Dataframe : 2차원
import pandas as pd
import numpy as np
# A DataFrame from a 2-D array of random values.
# Fixed: the class is pd.DataFrame (capital F); pd.Dataframe raised AttributeError.
df = pd.DataFrame(np.random.randn(4, 3))
# Column data for the labelled example. Defined here so the snippet does not
# depend on an unrelated `d` left over from an earlier snippet.
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
# Select rows and columns by label; "three" has no data and is filled with NaN.
# "one" is included so the insert below has a source column.
df = pd.DataFrame(d, index=["d", "b", "a"], columns=["one", "two", "three"])
# Fixed: sort by an existing column ("d" is a row label, not a column).
df_sorted = df.sort_values(by=["two"])  # Sorting
print(df.index)
print(df.columns)
df.pop("three")  # remove the column in place
df.insert(1, "bar", df["one"])  # add column "bar" at position 1, copied from "one"
- Dictionary Dataframe
import pandas as pd
# Column-oriented data: each key is a column name, each list holds that
# column's values in row order.
dictionary = {
    "name": ["Vinay", "Kushal", "Aman"],
    "age": [22, 25, 24],
    "occ": ["engineer", "doctor", "accountant"],
}
dataframe = pd.DataFrame(data=dictionary)
- Function
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
df = pd.read_csv("") # Read file
df.head() # Head : Look at few top rows
df.describe() # Summary : count, mean, std, min, max
df_rank = df.groupby(['rank']) # Group By : Split the data into groups
dr_rank.mean() # Mean : Calculate mean per each group
df_sub = df[df['salary']>120000] # Filtering
df['Salary'] # Slicing single brackets
df[['rank','salary']] # Slicing double brackets
df[10:20] # Selecting rows
df.loc[10:20, ['rank','sex','salary']] # Selecting rows using labels
df.iloc[10:20, [0,3,4,5]] # Selecting rows using index
df_iloc[i] # Seleting rows using row index
df_iloc[:, 0] # Selecting First column
df_iloc[1:3, 0:2]