{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", "\n", "# 17 - Partition Projection\n", "\n", "https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "import getpass\n", "from datetime import datetime\n", "\n", "import pandas as pd\n", "\n", "import awswrangler as wr" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Enter your bucket name:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ···········································\n" ] } ], "source": [ "bucket = getpass.getpass()" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Integer projection" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valueyearmonthday
0120191025
1220201126
2320211227
\n", "
" ], "text/plain": [ " value year month day\n", "0 1 2019 10 25\n", "1 2 2020 11 26\n", "2 3 2021 12 27" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\"value\": [1, 2, 3], \"year\": [2019, 2020, 2021], \"month\": [10, 11, 12], \"day\": [25, 26, 27]})\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_integer/\",\n", " dataset=True,\n", " partition_cols=[\"year\", \"month\", \"day\"],\n", " database=\"default\",\n", " table=\"table_integer\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\"year\": \"integer\", \"month\": \"integer\", \"day\": \"integer\"},\n", " \"projection_ranges\": {\"year\": \"2000,2025\", \"month\": \"1,12\", \"day\": \"1,31\"},\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valueyearmonthday
0320211227
1220201126
2120191025
\n", "
" ], "text/plain": [ " value year month day\n", "0 3 2021 12 27\n", "1 2 2020 11 26\n", "2 1 2019 10 25" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\"SELECT * FROM table_integer\", database=\"default\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Enum projection" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valuecity
01São Paulo
12Tokio
23Seattle
\n", "
" ], "text/plain": [ " value city\n", "0 1 São Paulo\n", "1 2 Tokio\n", "2 3 Seattle" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"value\": [1, 2, 3],\n", " \"city\": [\"São Paulo\", \"Tokio\", \"Seattle\"],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_enum/\",\n", " dataset=True,\n", " partition_cols=[\"city\"],\n", " database=\"default\",\n", " table=\"table_enum\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\n", " \"city\": \"enum\",\n", " },\n", " \"projection_values\": {\"city\": \"São Paulo,Tokio,Seattle\"},\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valuecity
01São Paulo
13Seattle
22Tokio
\n", "
" ], "text/plain": [ " value city\n", "0 1 São Paulo\n", "1 3 Seattle\n", "2 2 Tokio" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\"SELECT * FROM table_enum\", database=\"default\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Date projection" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valuedtts
012020-01-012020-01-01 00:00:00
122020-01-022020-01-01 00:00:01
232020-01-032020-01-01 00:00:02
\n", "
" ], "text/plain": [ " value dt ts\n", "0 1 2020-01-01 2020-01-01 00:00:00\n", "1 2 2020-01-02 2020-01-01 00:00:01\n", "2 3 2020-01-03 2020-01-01 00:00:02" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def ts(x):\n", " return datetime.strptime(x, \"%Y-%m-%d %H:%M:%S\")\n", "\n", "\n", "def dt(x):\n", " return datetime.strptime(x, \"%Y-%m-%d\").date()\n", "\n", "\n", "df = pd.DataFrame(\n", " {\n", " \"value\": [1, 2, 3],\n", " \"dt\": [dt(\"2020-01-01\"), dt(\"2020-01-02\"), dt(\"2020-01-03\")],\n", " \"ts\": [ts(\"2020-01-01 00:00:00\"), ts(\"2020-01-01 00:00:01\"), ts(\"2020-01-01 00:00:02\")],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_date/\",\n", " dataset=True,\n", " partition_cols=[\"dt\", \"ts\"],\n", " database=\"default\",\n", " table=\"table_date\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\n", " \"dt\": \"date\",\n", " \"ts\": \"date\",\n", " },\n", " \"projection_ranges\": {\"dt\": \"2020-01-01,2020-01-03\", \"ts\": \"2020-01-01 00:00:00,2020-01-01 00:00:02\"},\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valuedtts
012020-01-012020-01-01 00:00:00
122020-01-022020-01-01 00:00:01
232020-01-032020-01-01 00:00:02
\n", "
" ], "text/plain": [ " value dt ts\n", "0 1 2020-01-01 2020-01-01 00:00:00\n", "1 2 2020-01-02 2020-01-01 00:00:01\n", "2 3 2020-01-03 2020-01-01 00:00:02" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\"SELECT * FROM table_date\", database=\"default\")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Injected projection" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valueuuid
01761e2488-a078-11ea-bb37-0242ac130002
12b89ed095-8179-4635-9537-88592c0f6bc3
2387adc586-ce88-4f0a-b1c8-bf8e00d32249
\n", "
" ], "text/plain": [ " value uuid\n", "0 1 761e2488-a078-11ea-bb37-0242ac130002\n", "1 2 b89ed095-8179-4635-9537-88592c0f6bc3\n", "2 3 87adc586-ce88-4f0a-b1c8-bf8e00d32249" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"value\": [1, 2, 3],\n", " \"uuid\": [\n", " \"761e2488-a078-11ea-bb37-0242ac130002\",\n", " \"b89ed095-8179-4635-9537-88592c0f6bc3\",\n", " \"87adc586-ce88-4f0a-b1c8-bf8e00d32249\",\n", " ],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.to_parquet(\n", " df=df,\n", " path=f\"s3://{bucket}/table_injected/\",\n", " dataset=True,\n", " partition_cols=[\"uuid\"],\n", " database=\"default\",\n", " table=\"table_injected\",\n", " athena_partition_projection_settings={\n", " \"projection_types\": {\n", " \"uuid\": \"injected\",\n", " }\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valueuuid
02b89ed095-8179-4635-9537-88592c0f6bc3
\n", "
" ], "text/plain": [ " value uuid\n", "0 2 b89ed095-8179-4635-9537-88592c0f6bc3" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.athena.read_sql_query(\n", " sql=\"SELECT * FROM table_injected WHERE uuid='b89ed095-8179-4635-9537-88592c0f6bc3'\", database=\"default\"\n", ")" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Cleaning Up" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.s3.delete_objects(f\"s3://{bucket}/table_integer/\")\n", "wr.s3.delete_objects(f\"s3://{bucket}/table_enum/\")\n", "wr.s3.delete_objects(f\"s3://{bucket}/table_date/\")\n", "wr.s3.delete_objects(f\"s3://{bucket}/table_injected/\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "wr.catalog.delete_table_if_exists(table=\"table_integer\", database=\"default\")\n", "wr.catalog.delete_table_if_exists(table=\"table_enum\", database=\"default\")\n", "wr.catalog.delete_table_if_exists(table=\"table_date\", database=\"default\")\n", "wr.catalog.delete_table_if_exists(table=\"table_injected\", database=\"default\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.14", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.14" } }, "nbformat": 4, "nbformat_minor": 4 }