import pandas as pd


df=pd.read_csv( "C:/Acqrote/DataFrame/OnlineSalesData.csv")

df


df.columns

Index(['Transaction ID', 'Date', 'Product Category', 'Product Name',
       'Units Sold', 'Unit Price', 'Total Revenue', 'Region',
       'Payment Method'],
      dtype='object')


def getDataFrameSize(df: pd.DataFrame) -> list[int]:
    (c,d)=df.shape
    return [c, d]

getDataFrameSize(df)

[240, 9]


def selectData(df: pd.DataFrame, colName:str, product:str, colList:list) -> pd.DataFrame:
    return df[df[colName]==product][colList].sort_values(by=['Date']).tail(6)

colList=['Product Category', 'Product Name', 'Unit Price','Units Sold', 'Date']

electronicsSales=selectData(df, 'Product Category', 'Electronics', colList)
electronicsSales


def checkTotalRevenue(B: pd.DataFrame, colName:list) -> pd.DataFrame:
    A=B.copy()
    A['Revenue Check']=(A['Unit Price']*A['Units Sold']-A['Total Revenue']).map(insertRevenue_label)
    
    return A[colName].groupby('Revenue Check').size().reset_index(name='Result Label Count')

def insertRevenue_label(delta):
    if delta<0.01:
        return 'OK'
    else:
        return 'Mistake'
    
colName=['Revenue Check']


[TotalSalesQty, col]=getDataFrameSize(df)
print("Total Sales Lineitems:   ", TotalSalesQty)
checkTotalRevenue(df, colName)

Total Sales Lineitems:    240


def dropDuplicateTransactions(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates(keep='first', subset=['Transaction ID'])

[TransactionQtyBefore, col]=getDataFrameSize(df)

[TransactionQtyAfter, col]=getDataFrameSize(dropDuplicateTransactions(df))

print("Transaction quantities in 'dropDuplicateTransaction'\n", 
"[before: ", TransactionQtyBefore, "]   versus   [after: ", TransactionQtyAfter,"]")

Transaction quantities in 'dropDuplicateTransaction'
 [before:  240 ]   versus   [after:  240 ]


def dropMissingData(df: pd.DataFrame) -> pd.DataFrame:
    
    conditions = [
    df['Units Sold'].isna(),
    df['Total Revenue'].isna()
    ]

# Combine conditions using logical AND
    final_mask = conditions[0] & conditions[1]

    return df[~final_mask]


[TransactionQtyBefore, col]=getDataFrameSize(df)

[TransactionQtyAfter, col]=getDataFrameSize(dropMissingData(df))

print("Transaction quantities in 'dropMissingData'\n",
    "[before: ", TransactionQtyBefore, "]   versus   [after: ", TransactionQtyAfter,"]")

Transaction quantities in 'dropMissingData'
 [before:  240 ]   versus   [after:  240 ]


def pivotTable(df: pd.DataFrame, start_date: str, end_date: str) -> pd.DataFrame:
    productDF=df.pivot(index='Date', columns='Product Category', values='Total Revenue')
    
    return productDF[start_date:end_date]

BacktoSchoolDF=pivotTable(df, '2024-07-15', '2024-08-15')
BacktoSchoolDF


def meltTable(df: pd.DataFrame) -> pd.DataFrame:
    df.reset_index(inplace=True)   ### Make sure the ['Date'] is a column, not an index column
    meltProduct=df.melt(id_vars='Date', var_name='Product Category', value_name='Total Revenue')
    return meltProduct.dropna(subset = ['Total Revenue']).sort_values(by=['Date']).reset_index(drop=True)

BacktoSchoolProducts=meltTable(BacktoSchoolDF)
BacktoSchoolProducts


def aggregateUnitPrice(B:pd.DataFrame, lowPrice:float, medPrice:float, highPrice:float)->pd.DataFrame:
    A=B.copy()
    # Define the bins
    bins = [0,lowPrice, medPrice, highPrice]
    labels = ['Low Price (<=$'+str(lowPrice)+')', 'Medium Price', 'High Price (>$'+str(highPrice)+')']

    # Create a new column with the group labels
    A['Price Group'] = pd.cut(A['Unit Price'], bins=bins, labels=labels)
    
    # Aggregate the values based on the groups
    grouped = A.groupby('Price Group', )[['Units Sold']].sum()
    
    return grouped 

aggregateUnitPrice(df, 1750.00, 3500.00, 5000.00)


def region_payment(df: pd.DataFrame, region:str, payment:str):
    regionSales= df.loc[((df['Region'] ==region) & (df['Payment Method']==payment)), \
    ['Product Category', 'Units Sold', 'Region', 'Payment Method']]
    
    display=regionSales.groupby(['Product Category', 'Region', 'Payment Method'])[['Units Sold']].sum()
    return display

region_payment(df, 'Europe', 'PayPal')


def invalid_productnames(df: pd.DataFrame, charlen:int) -> pd.DataFrame:     
      return df.loc[df['Product Name'].str.len()>charlen, ['Transaction ID','Product Name'] ]
    
invalid_productnames(df, 50)


def checkEuropeRegion(region:str, revenue:float, exchangeRate:float):
    if region=='Europe':
        return revenue*exchangeRate
    else:
        return 'N/A'
def exchangeCurrency(B: pd.DataFrame)-> pd.DataFrame:
    A=B.copy()
    A['Total Revenue (Euro)']=A.apply(lambda x: checkEuropeRegion(region=x['Region'], revenue=x['Total Revenue'], exchangeRate=0.97), axis=1)
    result=A[['Region', 'Total Revenue', 'Total Revenue (Euro)']]
    return result[result['Region']=='Europe'].sort_values('Total Revenue (Euro)', ascending=False).head(6)

exchangeCurrency(df)


def maxRevenue_ProductCategory(df:pd.DataFrame)-> pd.DataFrame:
    highest_revenue = df.groupby('Product Category').apply(lambda x: x[x['Total Revenue'] == x['Total Revenue'].max()])
    return highest_revenue[['Product Category', 'Product Name', 'Unit Price', 'Units Sold', 'Total Revenue', 'Region']].sort_values('Total Revenue', ascending=False)

maxRevenue_ProductCategory(df)


def rankRevenue_PaymentMethod(B:pd.DataFrame, paymentMethod:str, productCat:str)->pd.DataFrame:
    A=B.copy()
    A['Revenue Rank']= A.groupby(['Payment Method', 'Product Category'])['Total Revenue'].rank(method='dense', ascending=False)
    result=A[['Payment Method', 'Total Revenue', 'Revenue Rank', 'Product Category', 'Region']].sort_values(by=['Payment Method', 'Revenue Rank'], ascending=True)
    return result[(result['Payment Method']==paymentMethod) & (result['Product Category']==productCat)].head(10)
    
rankRevenue_PaymentMethod(df, 'PayPal', 'Home Appliances')


def topProductSold(df:pd.DataFrame, qty:int)->pd.DataFrame:
    top = df.sort_values(['Units Sold', 'Product Category'], ascending=False).groupby('Product Category').head(qty).reset_index(drop=True)
    
    result=top [['Product Category', 'Units Sold', 'Region', 'Payment Method']]
    result=result.rename(columns={'Product Category': 'Product Category (Top '+str(qty)+' groups)'})
    return result
topProductSold(df, 2)


def count_unique_subjects(df: pd.DataFrame) -> pd.DataFrame:
    result=    df.groupby(['Region'])['Product Category'].nunique().reset_index()
    
    result=result.rename(columns={'Product Category':'Unique Category Count'})
    return result

count_unique_subjects(df)


def filteredAveRevenueRegion(df:pd.DataFrame, region:str, average:float)->pd.DataFrame:
    
    result=df.groupby('Product Category').filter(lambda x: x['Total Revenue'].mean()>average).sort_values(by='Region')[['Region', 'Product Category', 'Product Name', 'Total Revenue' ]]
    return result[result['Region']==region]

filteredAveRevenueRegion(df, 'Asia', 210.00)


def mostExpensiveProduct(df: pd.DataFrame):
    result= df.groupby(['Product Category' ])[['Unit Price']].max()
    result.rename(columns={'Unit Price':'Max Unit Price'}, inplace=True)
   
    return result

mostExpensiveProduct(df)


def aggProductbyRevenueDates(df:pd.DataFrame)->pd.DataFrame:

    return df.groupby('Product Category').agg(
        Revenue=('Total Revenue', 'sum'),
        Dates=('Date', lambda x: ',   '.join(sorted(x.unique())))
    ).reset_index()

aggProductbyRevenueDates(df)


ProductCatalog=df[['Product Category', 'Product Name', 'Unit Price']]
ProductSales=df[['Transaction ID', 'Date', 'Product Name', 'Units Sold', 'Region', 'Payment Method']]


def mergeTwoTables(dfa:pd.DataFrame, dfb:pd.DataFrame)->pd.DataFrame:
    colName=['Transaction ID', 'Date', 'Product Category', 'Product Name', 'Unit Price', 'Units Sold', 'Region', 'Payment Method']
    result=pd.merge(dfa, dfb, left_on='Product Name', right_on='Product Name', how='inner')[colName].drop_duplicates(keep='first', subset='Transaction ID')
    result['Total Revenue']=result['Unit Price']*result['Units Sold']
    return result

print('Shape of First Table: ', ProductCatalog.shape)
print('Shape of Second Table: ', ProductSales.shape)

finalTable=mergeTwoTables(ProductSales, ProductCatalog)
print(finalTable.shape)
finalTable.head(6)

Shape of First Table:  (240, 3)
Shape of Second Table:  (240, 6)
(240, 9)


def ExcludeItems(dfA:pd.DataFrame, dfB:pd.DataFrame, colName:str )->pd.DataFrame:
    return dfA[~dfA['Date'].isin(dfB['Date'])].reset_index()

ExcludeItems(df, BacktoSchoolDF, 'Date')


print(df.columns)
print(df.shape)

Index(['Transaction ID', 'Date', 'Product Category', 'Product Name',
       'Units Sold', 'Unit Price', 'Total Revenue', 'Region',
       'Payment Method'],
      dtype='object')
(240, 9)

	Transaction ID	Date	Product Category	Product Name	Units Sold	Unit Price	Total Revenue	Region	Payment Method
0	10001	2024-01-01	Electronics	iPhone 14 Pro	2	999.99	1999.98	North America	Credit Card
1	10002	2024-01-02	Home Appliances	Dyson V11 Vacuum	1	499.99	499.99	Europe	PayPal
2	10003	2024-01-03	Clothing	Levi's 501 Jeans	3	69.99	209.97	Asia	Debit Card
3	10004	2024-01-04	Books	The Da Vinci Code	4	15.99	63.96	North America	Credit Card
4	10005	2024-01-05	Beauty Products	Neutrogena Skincare Set	1	89.99	89.99	Europe	PayPal
...	...	...	...	...	...	...	...	...	...
235	10236	2024-08-23	Home Appliances	Nespresso Vertuo Next Coffee and Espresso Maker	1	159.99	159.99	Europe	PayPal
236	10237	2024-08-24	Clothing	Nike Air Force 1 Sneakers	3	90.00	270.00	Asia	Debit Card
237	10238	2024-08-25	Books	The Handmaid's Tale by Margaret Atwood	3	10.99	32.97	North America	Credit Card
238	10239	2024-08-26	Beauty Products	Sunday Riley Luna Sleeping Night Oil	1	55.00	55.00	Europe	PayPal
239	10240	2024-08-27	Sports	Yeti Rambler 20 oz Tumbler	2	29.99	59.98	Asia	Credit Card

	Product Category	Product Name	Unit Price	Units Sold	Date
204	Electronics	Samsung Galaxy Watch 4	299.99	1	2024-07-23
210	Electronics	Canon EOS Rebel T7i DSLR Camera	749.99	1	2024-07-29
216	Electronics	Apple MacBook Pro 16-inch	2399.00	1	2024-08-04
222	Electronics	Amazon Echo Dot (4th Gen)	49.99	4	2024-08-10
228	Electronics	Google Nest Wifi Router	169.00	1	2024-08-16
234	Electronics	Bose QuietComfort 35 II Wireless Headphones	299.00	1	2024-08-22

Product Category	Beauty Products	Books	Clothing	Electronics	Home Appliances	Sports
Date
2024-07-15	125.0	NaN	NaN	NaN	NaN	NaN
2024-07-16	NaN	NaN	NaN	NaN	NaN	449.99
2024-07-17	NaN	NaN	NaN	358.00	NaN	NaN
2024-07-18	NaN	NaN	NaN	NaN	99.95	NaN
2024-07-19	NaN	NaN	179.97	NaN	NaN	NaN
2024-07-20	NaN	29.98	NaN	NaN	NaN	NaN
2024-07-21	52.0	NaN	NaN	NaN	NaN	NaN
2024-07-22	NaN	NaN	NaN	NaN	NaN	399.99
2024-07-23	NaN	NaN	NaN	299.99	NaN	NaN
2024-07-24	NaN	NaN	NaN	NaN	379.99	NaN
2024-07-25	NaN	NaN	196.00	NaN	NaN	NaN
2024-07-26	NaN	50.97	NaN	NaN	NaN	NaN
2024-07-27	79.0	NaN	NaN	NaN	NaN	NaN
2024-07-28	NaN	NaN	NaN	NaN	NaN	129.00
2024-07-29	NaN	NaN	NaN	749.99	NaN	NaN
2024-07-30	NaN	NaN	NaN	NaN	339.98	NaN
2024-07-31	NaN	NaN	39.60	NaN	NaN	NaN
2024-08-01	NaN	32.97	NaN	NaN	NaN	NaN
2024-08-02	58.0	NaN	NaN	NaN	NaN	NaN
2024-08-03	NaN	NaN	NaN	NaN	NaN	349.99
2024-08-04	NaN	NaN	NaN	2399.00	NaN	NaN
2024-08-05	NaN	NaN	NaN	NaN	449.99	NaN
2024-08-06	NaN	NaN	149.97	NaN	NaN	NaN
2024-08-07	NaN	25.98	NaN	NaN	NaN	NaN
2024-08-08	27.0	NaN	NaN	NaN	NaN	NaN
2024-08-09	NaN	NaN	NaN	NaN	NaN	599.99
2024-08-10	NaN	NaN	NaN	199.96	NaN	NaN
2024-08-11	NaN	NaN	NaN	NaN	459.98	NaN
2024-08-12	NaN	NaN	89.98	NaN	NaN	NaN
2024-08-13	NaN	80.97	NaN	NaN	NaN	NaN
2024-08-14	6.7	NaN	NaN	NaN	NaN	NaN
2024-08-15	NaN	NaN	NaN	NaN	NaN	299.90

	Date	Product Category	Total Revenue
0	2024-07-15	Beauty Products	125.00
1	2024-07-16	Sports	449.99
2	2024-07-17	Electronics	358.00
3	2024-07-18	Home Appliances	99.95
4	2024-07-19	Clothing	179.97
5	2024-07-20	Books	29.98
6	2024-07-21	Beauty Products	52.00
7	2024-07-22	Sports	399.99
8	2024-07-23	Electronics	299.99
9	2024-07-24	Home Appliances	379.99
10	2024-07-25	Clothing	196.00
11	2024-07-26	Books	50.97
12	2024-07-27	Beauty Products	79.00
13	2024-07-28	Sports	129.00
14	2024-07-29	Electronics	749.99
15	2024-07-30	Home Appliances	339.98
16	2024-07-31	Clothing	39.60
17	2024-08-01	Books	32.97
18	2024-08-02	Beauty Products	58.00
19	2024-08-03	Sports	349.99
20	2024-08-04	Electronics	2399.00
21	2024-08-05	Home Appliances	449.99
22	2024-08-06	Clothing	149.97
23	2024-08-07	Books	25.98
24	2024-08-08	Beauty Products	27.00
25	2024-08-09	Sports	599.99
26	2024-08-10	Electronics	199.96
27	2024-08-11	Home Appliances	459.98
28	2024-08-12	Clothing	89.98
29	2024-08-13	Books	80.97
30	2024-08-14	Beauty Products	6.70
31	2024-08-15	Sports	299.90

	Transaction ID	Product Name
111	10112	Sapiens: A Brief History of Humankind by Yuval...
142	10143	Clinique Dramatically Different Moisturizing L...
172	10173	Paula's Choice Skin Perfecting 2% BHA Liquid E...
202	10203	Dr. Jart+ Cicapair Tiger Grass Color Correctin...

Pandas DataFrame Operations¶

Manage the online sales data¶

Get column names¶

Get DataFrame size¶

Select a subset¶

Check the accuracy of the column 'Total Revenue'¶

Drop duplicated transactions¶

Remove rows with missing values which cannot be recovered from other data¶

Total Revenue per Product Category during the 'back-to-school event'¶

Pivot a DataFrame¶

Compress 'Product Category' in multiple columns and list 'Total Revenue' in one column¶

Melt a DataFrame¶

Count 'Units Sold' by Price Group¶

Find products which were sold in Europe and paid via Paypal¶

Find lengthy product names¶

Add a column to display 'Total Revenue' in Europe using the currency 'euros'¶

Find the transactions of highest revenue in each product category¶

Rank the total revenue by product category in a specific payment method¶

Find the top sold quantity per day of each product category¶

Count the quantities of unique product category at each region¶

Find the average revenue > a specified value in each region¶

Find most expensive product in each product category¶

Aggregate Product Category, Display 'Revenue sum' and 'Dates' in each group¶

Divide the dataframe into two subsets: Product Catalog and Product Sales¶

Combine two dataframes into one dataframe and add a column 'Total Revenue'¶

Show the dataframe which excludes the BacktoSchool items¶

End of File¶

	Units Sold
Price Group
Low Price (<=$1750.0)	514
Medium Price	3
High Price (>$5000.0)	1

	Region	Total Revenue	Total Revenue (Euro)
85	Europe	2599.98	2521.9806
37	Europe	1599.98	1551.9806
7	Europe	1199.98	1163.9806
73	Europe	899.99	872.9903
127	Europe	799.98	775.9806
103	Europe	699.98	678.9806

	Region	Product Category	Product Name	Total Revenue
119	Asia	Sports	YETI Hopper Flip Portable Cooler	249.99
77	Asia	Sports	Rogue Fitness Kettlebell	209.97
83	Asia	Sports	Spalding NBA Street Basketball	149.94
89	Asia	Sports	On Running Cloud Shoes	259.98
95	Asia	Sports	Garmin Fenix 6X Pro	999.99
101	Asia	Sports	Under Armour HOVR Sonic 4 Shoes	219.98
107	Asia	Sports	Bowflex SelectTech 552 Dumbbells	399.99
113	Asia	Sports	Fitbit Versa 3	689.85
125	Asia	Sports	Yeti Roadie 24 Cooler	199.99
131	Asia	Sports	Hydro Flask Wide Mouth Water Bottle	159.80
137	Asia	Sports	Fitbit Inspire 2	199.90
143	Asia	Sports	YETI Tundra 45 Cooler	299.99
149	Asia	Sports	Bose SoundSport Wireless Earbuds	299.98
71	Asia	Sports	Nike Metcon 6	389.97
155	Asia	Sports	Garmin Forerunner 245	299.99
167	Asia	Sports	Yeti Rambler Bottle	149.97
173	Asia	Sports	Bowflex SelectTech 1090 Adjustable Dumbbells	699.99
179	Asia	Sports	Oakley Holbrook Sunglasses	146.00
185	Asia	Sports	Polar Vantage V2	499.95
191	Asia	Sports	TRX All-in-One Suspension Training System	169.95
197	Asia	Sports	GoPro HERO9 Black	449.99
203	Asia	Sports	Yeti Tundra Haul Portable Wheeled Cooler	399.99
209	Asia	Sports	Bose SoundLink Color Bluetooth Speaker II	129.00
215	Asia	Sports	YETI Tundra 65 Cooler	349.99
221	Asia	Sports	Garmin Forerunner 945	599.99
227	Asia	Sports	Fitbit Luxe	299.90
233	Asia	Sports	Hydro Flask Standard Mouth Water Bottle	98.85
161	Asia	Sports	TriggerPoint GRID Foam Roller	69.98
65	Asia	Sports	Garmin Edge 530	599.98
239	Asia	Sports	Yeti Rambler 20 oz Tumbler	59.98
35	Asia	Sports	Peloton Bike	1895.00
17	Asia	Sports	Manduka PRO Yoga Mat	479.96
11	Asia	Sports	Babolat Pure Drive Tennis Racket	599.97
23	Asia	Sports	Adidas FIFA World Cup Football	89.97
47	Asia	Sports	Yeti Rambler Tumbler	239.94
5	Asia	Sports	Wilson Evolution Basketball	149.95
53	Asia	Sports	Titleist Pro V1 Golf Balls	249.95
41	Asia	Sports	Fitbit Charge 5	259.98
59	Asia	Sports	Hyperice Hypervolt Massager	349.00
29	Asia	Sports	Nike Air Zoom Pegasus 37	259.98

	Product Category	Revenue	Dates
0	Beauty Products	2621.90	2024-01-05, 2024-01-11, 2024-01-17, 2024...
1	Books	1861.93	2024-01-04, 2024-01-10, 2024-01-16, 2024...
2	Clothing	8128.93	2024-01-03, 2024-01-09, 2024-01-15, 2024...
3	Electronics	34982.41	2024-01-01, 2024-01-07, 2024-01-13, 2024...
4	Home Appliances	18646.16	2024-01-02, 2024-01-08, 2024-01-14, 2024...
5	Sports	14326.52	2024-01-06, 2024-01-12, 2024-01-18, 2024...