Saltar a contenido

Pandas vs Polars vs DuckDB

EuropeSales.csv

Enlace descarga

pip install polars
import pandas as pd
import polars as pl
import duckdb

Carga de datos (CSVs)

Pandas

df_pd = pd.read_csv("EuropeSales.csv")
df_pd.head(3)
Region Country Item Type Sales Channel Order Priority Order Date Order ID Ship Date Units Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
0 Europe Czech Republic Beverages Offline C 9/12/2011 478051030 9/29/2011 4778 47.45 31.79 226716.10 151892.62 74823.48
1 Europe Bosnia and Herzegovina Clothes Online M 10/14/2013 919133651 11/4/2013 927 109.28 35.84 101302.56 33223.68 68078.88
2 Europe Austria Cereal Offline C 8/13/2014 987410676 9/6/2014 5616 205.70 117.11 1155211.20 657689.76 497521.44

Polars

df_pl = pl.read_csv("EuropeSales.csv")
df_pl.head(3)
RegionCountryItem TypeSales ChannelOrder PriorityOrder DateOrder IDShip DateUnits SoldUnit PriceUnit CostTotal RevenueTotal CostTotal Profit
strstrstrstrstrstri64stri64f64f64f64f64f64
"Europe""Czech Republic""Beverages""Offline""C""9/12/2011"478051030"9/29/2011"477847.4531.79226716.1151892.6274823.48
"Europe""Bosnia and Herzegovina""Clothes""Online""M""10/14/2013"919133651"11/4/2013"927109.2835.84101302.5633223.6868078.88
"Europe""Austria""Cereal""Offline""C""8/13/2014"987410676"9/6/2014"5616205.7117.111155211.2657689.76497521.44

DuckDB

con = duckdb.connect()
df_duck = con.execute("SELECT * FROM 'EuropeSales.csv'").df()
df_duck.head(3)
Region Country Item Type Sales Channel Order Priority Order Date Order ID Ship Date Units Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
0 Europe Czech Republic Beverages Offline C 2011-09-12 478051030 2011-09-29 4778 47.45 31.79 226716.10 151892.62 74823.48
1 Europe Bosnia and Herzegovina Clothes Online M 2013-10-14 919133651 2013-11-04 927 109.28 35.84 101302.56 33223.68 68078.88
2 Europe Austria Cereal Offline C 2014-08-13 987410676 2014-09-06 5616 205.70 117.11 1155211.20 657689.76 497521.44

Filtrado de Datos

Pandas

fpd = df_pd[df_pd.Country == "Ukraine"]
fpd.head(10)
Region Country Item Type Sales Channel Order Priority Order Date Order ID Ship Date Units Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
15 Europe Ukraine Cosmetics Online H 1/22/2011 773645913 1/28/2011 7873 437.20 263.33 3442075.60 2073197.09 1368878.51
25 Europe Ukraine Personal Care Online M 7/16/2010 987459170 8/27/2010 9967 81.73 56.67 814602.91 564829.89 249773.02
156 Europe Ukraine Cosmetics Online C 6/9/2014 183910119 6/11/2014 1946 437.20 263.33 850791.20 512440.18 338351.02
164 Europe Ukraine Snacks Offline C 1/25/2016 611791121 3/3/2016 1443 152.58 97.44 220172.94 140605.92 79567.02
226 Europe Ukraine Office Supplies Offline M 5/3/2017 181260526 6/3/2017 7872 651.21 524.96 5126325.12 4132485.12 993840.00
292 Europe Ukraine Vegetables Online C 11/6/2011 446019017 11/25/2011 5794 154.06 90.93 892623.64 526848.42 365775.22
296 Europe Ukraine Clothes Offline H 9/27/2016 747955071 10/25/2016 5382 109.28 35.84 588144.96 192890.88 395254.08
343 Europe Ukraine Household Online H 7/28/2010 741866242 9/13/2010 3562 668.27 502.54 2380377.74 1790047.48 590330.26
407 Europe Ukraine Meat Offline H 4/25/2017 783594977 5/1/2017 2022 421.89 364.69 853061.58 737403.18 115658.40
488 Europe Ukraine Cosmetics Online C 7/8/2014 140461766 7/20/2014 4613 437.20 263.33 2016803.60 1214741.29 802062.31

Polars

fpd = df_pl.filter(pl.col("Country") == "Ukraine")
fpd.head(10)
RegionCountryItem TypeSales ChannelOrder PriorityOrder DateOrder IDShip DateUnits SoldUnit PriceUnit CostTotal RevenueTotal CostTotal Profit
strstrstrstrstrstri64stri64f64f64f64f64f64
"Europe""Ukraine""Cosmetics""Online""H""1/22/2011"773645913"1/28/2011"7873437.2263.333442075.62.0732e61.3689e6
"Europe""Ukraine""Personal Care""Online""M""7/16/2010"987459170"8/27/2010"996781.7356.67814602.91564829.89249773.02
"Europe""Ukraine""Cosmetics""Online""C""6/9/2014"183910119"6/11/2014"1946437.2263.33850791.2512440.18338351.02
"Europe""Ukraine""Snacks""Offline""C""1/25/2016"611791121"3/3/2016"1443152.5897.44220172.94140605.9279567.02
"Europe""Ukraine""Office Supplies""Offline""M""5/3/2017"181260526"6/3/2017"7872651.21524.965.1263e64.1325e6993840.0
"Europe""Ukraine""Vegetables""Online""C""11/6/2011"446019017"11/25/2011"5794154.0690.93892623.64526848.42365775.22
"Europe""Ukraine""Clothes""Offline""H""9/27/2016"747955071"10/25/2016"5382109.2835.84588144.96192890.88395254.08
"Europe""Ukraine""Household""Online""H""7/28/2010"741866242"9/13/2010"3562668.27502.542.3804e61.7900e6590330.26
"Europe""Ukraine""Meat""Offline""H""4/25/2017"783594977"5/1/2017"2022421.89364.69853061.58737403.18115658.4
"Europe""Ukraine""Cosmetics""Online""C""7/8/2014"140461766"7/20/2014"4613437.2263.332016803.61.2147e6802062.31

DuckDB

df_duck = con.execute("SELECT * FROM 'EuropeSales.csv' WHERE Country LIKE 'Ukraine'").df()
df_duck.head(10)
Region Country Item Type Sales Channel Order Priority Order Date Order ID Ship Date Units Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
0 Europe Ukraine Cosmetics Online H 2011-01-22 773645913 2011-01-28 7873 437.20 263.33 3442075.60 2073197.09 1368878.51
1 Europe Ukraine Personal Care Online M 2010-07-16 987459170 2010-08-27 9967 81.73 56.67 814602.91 564829.89 249773.02
2 Europe Ukraine Cosmetics Online C 2014-06-09 183910119 2014-06-11 1946 437.20 263.33 850791.20 512440.18 338351.02
3 Europe Ukraine Snacks Offline C 2016-01-25 611791121 2016-03-03 1443 152.58 97.44 220172.94 140605.92 79567.02
4 Europe Ukraine Office Supplies Offline M 2017-05-03 181260526 2017-06-03 7872 651.21 524.96 5126325.12 4132485.12 993840.00
5 Europe Ukraine Vegetables Online C 2011-11-06 446019017 2011-11-25 5794 154.06 90.93 892623.64 526848.42 365775.22
6 Europe Ukraine Clothes Offline H 2016-09-27 747955071 2016-10-25 5382 109.28 35.84 588144.96 192890.88 395254.08
7 Europe Ukraine Household Online H 2010-07-28 741866242 2010-09-13 3562 668.27 502.54 2380377.74 1790047.48 590330.26
8 Europe Ukraine Meat Offline H 2017-04-25 783594977 2017-05-01 2022 421.89 364.69 853061.58 737403.18 115658.40
9 Europe Ukraine Cosmetics Online C 2014-07-08 140461766 2014-07-20 4613 437.20 263.33 2016803.60 1214741.29 802062.31

Agregar Datos

Pandas

agg_pd = df_pd.groupby("Country")["Total Revenue"].sum().reset_index()
agg_pd.head(10)
Country Total Revenue
0 Albania 32224853.87
1 Andorra 47756693.17
2 Armenia 37519840.21
3 Austria 35740871.49
4 Belarus 34236260.77
5 Belgium 25852572.30
6 Bosnia and Herzegovina 50117508.49
7 Bulgaria 38161555.70
8 Croatia 27348195.70
9 Cyprus 33008851.50

Polars

agg_pl = df_pl.group_by("Country").agg([pl.sum("Total Revenue").alias("country_revenue")])
agg_pl.head(15)
Countrycountry_revenue
strf64
"Switzerland"3.1875174e7
"Liechtenstein"2.9873e7
"Italy"3.5878e7
"Croatia"2.7348e7
"Slovakia"4.2941e7
"Armenia"3.7520e7
"Macedonia"4.9222e7
"Montenegro"3.1346e7
"Latvia"3.8722e7
"Austria"3.5741e7

DuckDB

agg_duck = con.execute("""
    SELECT Country, sum("Total Revenue") AS country_revenue
    FROM 'EuropeSales.csv'
    GROUP BY country
    ORDER BY country
""").df()
agg_duck.head(10)
Country country_revenue
0 Albania 32224853.87
1 Andorra 47756693.17
2 Armenia 37519840.21
3 Austria 35740871.49
4 Belarus 34236260.77
5 Belgium 25852572.30
6 Bosnia and Herzegovina 50117508.49
7 Bulgaria 38161555.70
8 Croatia 27348195.70
9 Cyprus 33008851.50