Input, Subset and Output External Data Files using Pandas
Introduction#
This section shows basic code for reading, sub-setting and writing external data files using pandas.
Basic Code to Import, Subset and Write External Data Files Using Pandas
# Print the working directory
import os
print os.getcwd()
# C:\Python27\Scripts
# Set the working directory
os.chdir('C:/Users/general1/Documents/simple Python files')
print os.getcwd()
# C:\Users\general1\Documents\simple Python files
# load pandas
import pandas as pd
# read a csv data file named 'small_dataset.csv' containing 4 lines and 3 variables
my_data = pd.read_csv("small_dataset.csv")
my_data
# x y z
# 0 1 2 3
# 1 4 5 6
# 2 7 8 9
# 3 10 11 12
my_data.shape # number of rows and columns in data set
# (4, 3)
my_data.shape[0] # number of rows in data set
# 4
my_data.shape[1] # number of columns in data set
# 3
# Python uses 0-based indexing. The first row or column in a data set is located
# at position 0. In R the first row or column in a data set is located
# at position 1.
# Select the first two rows
my_data[0:2]
# x y z
#0 1 2 3
#1 4 5 6
# Select the second and third rows
my_data[1:3]
# x y z
# 1 4 5 6
# 2 7 8 9
# Select the third row
my_data[2:3]
# x y z
#2 7 8 9
# Select the first two elements of the first column
my_data.iloc[0:2, 0:1]
# x
# 0 1
# 1 4
# Select the first element of the variables y and z
my_data.loc[0, ['y', 'z']]
# y 2
# z 3
# Select the first three elements of the variables y and z
my_data.loc[0:2, ['y', 'z']]
# y z
# 0 2 3
# 1 5 6
# 2 8 9
# Write the first three elements of the variables y and z
# to an external file. Here index = 0 means do not write row names.
my_data2 = my_data.loc[0:2, ['y', 'z']]
my_data2.to_csv('my.output.csv', index = 0)