Pandas vs Polars vs DuckDB
import pandas as pd
import polars as pl
import duckdb
Carga de datos (CSVs)
Pandas
df_pd = pd.read_csv("EuropeSales.csv")
df_pd.head(3)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Order ID | Ship Date | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit |
| 0 | Europe | Czech Republic | Beverages | Offline | C | 9/12/2011 | 478051030 | 9/29/2011 | 4778 | 47.45 | 31.79 | 226716.10 | 151892.62 | 74823.48 |
| 1 | Europe | Bosnia and Herzegovina | Clothes | Online | M | 10/14/2013 | 919133651 | 11/4/2013 | 927 | 109.28 | 35.84 | 101302.56 | 33223.68 | 68078.88 |
| 2 | Europe | Austria | Cereal | Offline | C | 8/13/2014 | 987410676 | 9/6/2014 | 5616 | 205.70 | 117.11 | 1155211.20 | 657689.76 | 497521.44 |
Polars
df_pl = pl.read_csv("EuropeSales.csv")
df_pl.head(3)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Order ID | Ship Date | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit |
|---|
| str | str | str | str | str | str | i64 | str | i64 | f64 | f64 | f64 | f64 | f64 |
| "Europe" | "Czech Republic" | "Beverages" | "Offline" | "C" | "9/12/2011" | 478051030 | "9/29/2011" | 4778 | 47.45 | 31.79 | 226716.1 | 151892.62 | 74823.48 |
| "Europe" | "Bosnia and Herzegovina" | "Clothes" | "Online" | "M" | "10/14/2013" | 919133651 | "11/4/2013" | 927 | 109.28 | 35.84 | 101302.56 | 33223.68 | 68078.88 |
| "Europe" | "Austria" | "Cereal" | "Offline" | "C" | "8/13/2014" | 987410676 | "9/6/2014" | 5616 | 205.7 | 117.11 | 1155211.2 | 657689.76 | 497521.44 |
DuckDB
con = duckdb.connect()
df_duck = con.execute("SELECT * FROM 'EuropeSales.csv'").df()
df_duck.head(3)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Order ID | Ship Date | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit |
| 0 | Europe | Czech Republic | Beverages | Offline | C | 2011-09-12 | 478051030 | 2011-09-29 | 4778 | 47.45 | 31.79 | 226716.10 | 151892.62 | 74823.48 |
| 1 | Europe | Bosnia and Herzegovina | Clothes | Online | M | 2013-10-14 | 919133651 | 2013-11-04 | 927 | 109.28 | 35.84 | 101302.56 | 33223.68 | 68078.88 |
| 2 | Europe | Austria | Cereal | Offline | C | 2014-08-13 | 987410676 | 2014-09-06 | 5616 | 205.70 | 117.11 | 1155211.20 | 657689.76 | 497521.44 |
Filtrado de Datos
Pandas
fpd = df_pd[df_pd.Country == "Ukraine"]
fpd.head(10)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Order ID | Ship Date | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit |
| 15 | Europe | Ukraine | Cosmetics | Online | H | 1/22/2011 | 773645913 | 1/28/2011 | 7873 | 437.20 | 263.33 | 3442075.60 | 2073197.09 | 1368878.51 |
| 25 | Europe | Ukraine | Personal Care | Online | M | 7/16/2010 | 987459170 | 8/27/2010 | 9967 | 81.73 | 56.67 | 814602.91 | 564829.89 | 249773.02 |
| 156 | Europe | Ukraine | Cosmetics | Online | C | 6/9/2014 | 183910119 | 6/11/2014 | 1946 | 437.20 | 263.33 | 850791.20 | 512440.18 | 338351.02 |
| 164 | Europe | Ukraine | Snacks | Offline | C | 1/25/2016 | 611791121 | 3/3/2016 | 1443 | 152.58 | 97.44 | 220172.94 | 140605.92 | 79567.02 |
| 226 | Europe | Ukraine | Office Supplies | Offline | M | 5/3/2017 | 181260526 | 6/3/2017 | 7872 | 651.21 | 524.96 | 5126325.12 | 4132485.12 | 993840.00 |
| 292 | Europe | Ukraine | Vegetables | Online | C | 11/6/2011 | 446019017 | 11/25/2011 | 5794 | 154.06 | 90.93 | 892623.64 | 526848.42 | 365775.22 |
| 296 | Europe | Ukraine | Clothes | Offline | H | 9/27/2016 | 747955071 | 10/25/2016 | 5382 | 109.28 | 35.84 | 588144.96 | 192890.88 | 395254.08 |
| 343 | Europe | Ukraine | Household | Online | H | 7/28/2010 | 741866242 | 9/13/2010 | 3562 | 668.27 | 502.54 | 2380377.74 | 1790047.48 | 590330.26 |
| 407 | Europe | Ukraine | Meat | Offline | H | 4/25/2017 | 783594977 | 5/1/2017 | 2022 | 421.89 | 364.69 | 853061.58 | 737403.18 | 115658.40 |
| 488 | Europe | Ukraine | Cosmetics | Online | C | 7/8/2014 | 140461766 | 7/20/2014 | 4613 | 437.20 | 263.33 | 2016803.60 | 1214741.29 | 802062.31 |
Polars
fpd = df_pl.filter(pl.col("Country") == "Ukraine")
fpd.head(10)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Order ID | Ship Date | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit |
|---|
| str | str | str | str | str | str | i64 | str | i64 | f64 | f64 | f64 | f64 | f64 |
| "Europe" | "Ukraine" | "Cosmetics" | "Online" | "H" | "1/22/2011" | 773645913 | "1/28/2011" | 7873 | 437.2 | 263.33 | 3442075.6 | 2.0732e6 | 1.3689e6 |
| "Europe" | "Ukraine" | "Personal Care" | "Online" | "M" | "7/16/2010" | 987459170 | "8/27/2010" | 9967 | 81.73 | 56.67 | 814602.91 | 564829.89 | 249773.02 |
| "Europe" | "Ukraine" | "Cosmetics" | "Online" | "C" | "6/9/2014" | 183910119 | "6/11/2014" | 1946 | 437.2 | 263.33 | 850791.2 | 512440.18 | 338351.02 |
| "Europe" | "Ukraine" | "Snacks" | "Offline" | "C" | "1/25/2016" | 611791121 | "3/3/2016" | 1443 | 152.58 | 97.44 | 220172.94 | 140605.92 | 79567.02 |
| "Europe" | "Ukraine" | "Office Supplies" | "Offline" | "M" | "5/3/2017" | 181260526 | "6/3/2017" | 7872 | 651.21 | 524.96 | 5.1263e6 | 4.1325e6 | 993840.0 |
| "Europe" | "Ukraine" | "Vegetables" | "Online" | "C" | "11/6/2011" | 446019017 | "11/25/2011" | 5794 | 154.06 | 90.93 | 892623.64 | 526848.42 | 365775.22 |
| "Europe" | "Ukraine" | "Clothes" | "Offline" | "H" | "9/27/2016" | 747955071 | "10/25/2016" | 5382 | 109.28 | 35.84 | 588144.96 | 192890.88 | 395254.08 |
| "Europe" | "Ukraine" | "Household" | "Online" | "H" | "7/28/2010" | 741866242 | "9/13/2010" | 3562 | 668.27 | 502.54 | 2.3804e6 | 1.7900e6 | 590330.26 |
| "Europe" | "Ukraine" | "Meat" | "Offline" | "H" | "4/25/2017" | 783594977 | "5/1/2017" | 2022 | 421.89 | 364.69 | 853061.58 | 737403.18 | 115658.4 |
| "Europe" | "Ukraine" | "Cosmetics" | "Online" | "C" | "7/8/2014" | 140461766 | "7/20/2014" | 4613 | 437.2 | 263.33 | 2016803.6 | 1.2147e6 | 802062.31 |
DuckDB
df_duck = con.execute("SELECT * FROM 'EuropeSales.csv' WHERE Country LIKE 'Ukraine'").df()
df_duck.head(10)
| Region | Country | Item Type | Sales Channel | Order Priority | Order Date | Order ID | Ship Date | Units Sold | Unit Price | Unit Cost | Total Revenue | Total Cost | Total Profit |
| 0 | Europe | Ukraine | Cosmetics | Online | H | 2011-01-22 | 773645913 | 2011-01-28 | 7873 | 437.20 | 263.33 | 3442075.60 | 2073197.09 | 1368878.51 |
| 1 | Europe | Ukraine | Personal Care | Online | M | 2010-07-16 | 987459170 | 2010-08-27 | 9967 | 81.73 | 56.67 | 814602.91 | 564829.89 | 249773.02 |
| 2 | Europe | Ukraine | Cosmetics | Online | C | 2014-06-09 | 183910119 | 2014-06-11 | 1946 | 437.20 | 263.33 | 850791.20 | 512440.18 | 338351.02 |
| 3 | Europe | Ukraine | Snacks | Offline | C | 2016-01-25 | 611791121 | 2016-03-03 | 1443 | 152.58 | 97.44 | 220172.94 | 140605.92 | 79567.02 |
| 4 | Europe | Ukraine | Office Supplies | Offline | M | 2017-05-03 | 181260526 | 2017-06-03 | 7872 | 651.21 | 524.96 | 5126325.12 | 4132485.12 | 993840.00 |
| 5 | Europe | Ukraine | Vegetables | Online | C | 2011-11-06 | 446019017 | 2011-11-25 | 5794 | 154.06 | 90.93 | 892623.64 | 526848.42 | 365775.22 |
| 6 | Europe | Ukraine | Clothes | Offline | H | 2016-09-27 | 747955071 | 2016-10-25 | 5382 | 109.28 | 35.84 | 588144.96 | 192890.88 | 395254.08 |
| 7 | Europe | Ukraine | Household | Online | H | 2010-07-28 | 741866242 | 2010-09-13 | 3562 | 668.27 | 502.54 | 2380377.74 | 1790047.48 | 590330.26 |
| 8 | Europe | Ukraine | Meat | Offline | H | 2017-04-25 | 783594977 | 2017-05-01 | 2022 | 421.89 | 364.69 | 853061.58 | 737403.18 | 115658.40 |
| 9 | Europe | Ukraine | Cosmetics | Online | C | 2014-07-08 | 140461766 | 2014-07-20 | 4613 | 437.20 | 263.33 | 2016803.60 | 1214741.29 | 802062.31 |
Agregar Datos
Pandas
agg_pd = df_pd.groupby("Country")["Total Revenue"].sum().reset_index()
agg_pd.head(10)
| Country | Total Revenue |
| 0 | Albania | 32224853.87 |
| 1 | Andorra | 47756693.17 |
| 2 | Armenia | 37519840.21 |
| 3 | Austria | 35740871.49 |
| 4 | Belarus | 34236260.77 |
| 5 | Belgium | 25852572.30 |
| 6 | Bosnia and Herzegovina | 50117508.49 |
| 7 | Bulgaria | 38161555.70 |
| 8 | Croatia | 27348195.70 |
| 9 | Cyprus | 33008851.50 |
Polars
agg_pl = df_pl.group_by("Country").agg([pl.sum("Total Revenue").alias("country_revenue")])
agg_pl.head(15)
| Country | country_revenue |
|---|
| str | f64 |
| "Switzerland" | 3.1875174e7 |
| "Liechtenstein" | 2.9873e7 |
| "Italy" | 3.5878e7 |
| "Croatia" | 2.7348e7 |
| "Slovakia" | 4.2941e7 |
| … | … |
| "Armenia" | 3.7520e7 |
| "Macedonia" | 4.9222e7 |
| "Montenegro" | 3.1346e7 |
| "Latvia" | 3.8722e7 |
| "Austria" | 3.5741e7 |
DuckDB
agg_duck = con.execute("""
SELECT Country, sum("Total Revenue") AS country_revenue
FROM 'EuropeSales.csv'
GROUP BY country
ORDER BY country
""").df()
agg_duck.head(10)
| Country | country_revenue |
| 0 | Albania | 32224853.87 |
| 1 | Andorra | 47756693.17 |
| 2 | Armenia | 37519840.21 |
| 3 | Austria | 35740871.49 |
| 4 | Belarus | 34236260.77 |
| 5 | Belgium | 25852572.30 |
| 6 | Bosnia and Herzegovina | 50117508.49 |
| 7 | Bulgaria | 38161555.70 |
| 8 | Croatia | 27348195.70 |
| 9 | Cyprus | 33008851.50 |