feat: Onboard American Community Survey dataset (#222)

GoogleCloudPlatform · Dec 29, 2021 · 861d0e6 · 861d0e6
1 parent fe6c826
commit 861d0e6
Show file tree

Hide file tree

Showing 65 changed files with 27,078 additions and 1 deletion.
diff --git a/Pipfile b/Pipfile
@@ -26,4 +26,4 @@ Jinja2 = "==2.11.3"
 SQLAlchemy = "==1.3.24"
 
 [requires]
-python_version = "3.8"
+python_version = "3.8"
diff --git a/datasets/census_bureau_acs/_images/run_csv_transform_kub/Dockerfile b/datasets/census_bureau_acs/_images/run_csv_transform_kub/Dockerfile
@@ -0,0 +1,39 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base image: official Python 3.8
+FROM python:3.8
+
+# Unbuffered output so statements and log messages appear in Cloud logs
+ENV PYTHONUNBUFFERED=True
+
+# Bring in the requirements file and install the packages it lists
+COPY requirements.txt ./
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+
+# Every following RUN, CMD, ENTRYPOINT, COPY and ADD instruction is resolved
+# relative to /custom; the directory is created if it does not exist yet.
+WORKDIR /custom
+
+# Data processing script and its JSON lookup files, placed under /custom
+COPY csv_transform.py group_ids.json state_codes.json ./
+
+# Run the data processing script when the container starts
+CMD ["python3", "csv_transform.py"]
diff --git a/datasets/census_bureau_acs/_images/run_csv_transform_kub/csv_transform.py b/datasets/census_bureau_acs/_images/run_csv_transform_kub/csv_transform.py
@@ -0,0 +1,228 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import datetime
+import json
+import logging
+import os
+import pathlib
+import typing
+
+import numpy as np
+import pandas as pd
+import requests
+from google.cloud import storage
+
+
+def main(
+    source_url: str,
+    year_report: str,
+    api_naming_convention: str,
+    target_file: pathlib.Path,
+    target_gcs_bucket: str,
+    target_gcs_path: str,
+    headers: typing.List[str],
+    rename_mappings: dict,
+    pipeline_name: str,
+    geography: str,
+    report_level: str,
+    concat_col: typing.List[str],
+) -> None:
+    """Download ACS data, pivot it to one row per geo_id and upload it as CSV.
+
+    Args:
+        source_url: URL template containing ``~placeholder~`` tokens that the
+            extract functions fill in for each request.
+        year_report: Value substituted for ``~year_report~``.
+        api_naming_convention: Value substituted for
+            ``~api_naming_convention~``.
+        target_file: Local path of the CSV file to write.
+        target_gcs_bucket: Name of the destination GCS bucket.
+        target_gcs_path: Object path inside the destination bucket.
+        headers: Column names, in order, kept in the output file.
+        rename_mappings: Mapping of positional column index (convertible to
+            ``int``) to the new column name.
+        pipeline_name: Name used in the start/completion log messages.
+        geography: Geography type; ``"censustract"`` and ``"blockgroup"``
+            get their tract/state/county codes zero-padded.
+        report_level: Either ``"national_level"`` or ``"state_level"``.
+        concat_col: Columns concatenated, in order, to build ``geo_id``.
+
+    Raises:
+        ValueError: If ``report_level`` is not a supported value.
+        Exception: Any error raised while saving the output file is logged
+            and re-raised so a missing or stale file is never uploaded.
+    """
+
+    logging.info(
+        f"ACS {pipeline_name} process started at "
+        + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+    )
+
+    logging.info("Creating 'files' folder")
+    pathlib.Path("./files").mkdir(parents=True, exist_ok=True)
+
+    # Context managers make sure both lookup files are closed after parsing.
+    with open("group_ids.json") as json_obj_group_id:
+        group_id = json.load(json_obj_group_id)
+
+    with open("state_codes.json") as json_obj_state_code:
+        state_code = json.load(json_obj_state_code)
+
+    logging.info("Extracting the data from API and loading into dataframe...")
+    if report_level == "national_level":
+        df = extract_data_and_convert_to_df_national_level(
+            group_id, year_report, api_naming_convention, source_url
+        )
+    elif report_level == "state_level":
+        df = extract_data_and_convert_to_df_state_level(
+            group_id, state_code, year_report, api_naming_convention, source_url
+        )
+    else:
+        # Fail with a clear message instead of an UnboundLocalError on `df`.
+        raise ValueError(
+            f"Unsupported report_level {report_level!r}; "
+            "expected 'national_level' or 'state_level'."
+        )
+
+    logging.info("Replacing values...")
+    df = df.replace(to_replace={"KPI_Name": group_id})
+
+    logging.info("Renaming headers...")
+    rename_headers(df, rename_mappings)
+
+    logging.info("Creating column geo_id...")
+    if geography in ("censustract", "blockgroup"):
+        df["tract"] = df["tract"].apply(pad_zeroes_to_the_left, args=(6,))
+        df["state"] = df["state"].apply(pad_zeroes_to_the_left, args=(2,))
+        df["county"] = df["county"].apply(pad_zeroes_to_the_left, args=(3,))
+
+    df = create_geo_id(df, concat_col)
+
+    logging.info("Pivoting the dataframe...")
+    df = df[["geo_id", "KPI_Name", "KPI_Value"]]
+    df = df.pivot_table(
+        index="geo_id", columns="KPI_Name", values="KPI_Value", aggfunc=np.sum
+    ).reset_index()
+
+    logging.info("Reordering headers...")
+    df = df[headers]
+
+    logging.info(f"Saving to output file.. {target_file}")
+    try:
+        save_to_new_file(df, file_path=str(target_file))
+    except Exception as e:
+        logging.error(f"Error saving output file: {e}.")
+        # Do not continue to the upload step with a missing or stale file.
+        raise
+
+    logging.info(
+        f"Uploading output file to.. gs://{target_gcs_bucket}/{target_gcs_path}"
+    )
+    upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path)
+
+    logging.info(
+        f"ACS {pipeline_name} process completed at "
+        + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+    )
+
+
+def string_replace(source_url, replace: dict) -> str:
+    """Apply every ``old -> new`` substitution in ``replace`` to ``source_url``.
+
+    Args:
+        source_url: The string (URL template) to transform.
+        replace: Mapping of substring to its replacement; applied in the
+            mapping's iteration order.
+
+    Returns:
+        The string with all substitutions applied. An empty mapping returns
+        ``source_url`` unchanged.
+    """
+    source_url_new = source_url
+    for k, v in replace.items():
+        # Accumulate on the running result so earlier replacements are kept.
+        source_url_new = source_url_new.replace(k, v)
+    return source_url_new
+
+
+def extract_data_and_convert_to_df_national_level(
+    group_id: dict, year_report: str, api_naming_convention: str, source_url: str
+) -> pd.DataFrame:
+    """Fetch one API response per KPI and stack the rows into one dataframe.
+
+    For each key of ``group_id`` the ``~year_report~``, ``~group_id~``,
+    ``~row_position~`` and ``~api_naming_convention~`` placeholders of
+    ``source_url`` are filled in; the last three characters of the key are the
+    row position and the rest is the group id. Requests that fail or return a
+    status other than 200 are logged and skipped.
+
+    Args:
+        group_id: Mapping whose keys are the KPI identifiers to request.
+        year_report: Value substituted for ``~year_report~``.
+        api_naming_convention: Value substituted for
+            ``~api_naming_convention~``.
+        source_url: URL template with the placeholders listed above.
+
+    Returns:
+        The concatenation of all per-KPI frames, each tagged with a
+        ``KPI_Name`` column holding the KPI key.
+
+    Raises:
+        ValueError: If no request produced any data.
+    """
+    list_temp = []
+    for key in group_id:
+        logging.info(f"reading data from API for KPI {key}...")
+        source_url_new = (
+            source_url.replace("~year_report~", year_report)
+            .replace("~group_id~", key[0:-3])
+            .replace("~row_position~", key[-3:])
+            .replace("~api_naming_convention~", api_naming_convention)
+        )
+        try:
+            r = requests.get(source_url_new, stream=True)
+            logging.info(f"Source url : {source_url_new}")
+            logging.info(f"status code : {r.status_code}")
+            if r.status_code == 200:
+                text = r.json()
+                frame = load_nested_list_into_df_without_headers(text)
+                frame["KPI_Name"] = key
+                list_temp.append(frame)
+            else:
+                # Make skipped KPIs visible instead of dropping them silently.
+                logging.warning(
+                    f"skipping KPI {key}: unexpected status code {r.status_code}"
+                )
+        except (OSError, ValueError) as e:
+            # OSError covers requests' network errors; ValueError covers a
+            # body that is not valid JSON. Either way the KPI is skipped.
+            logging.warning(f"error : {e}")
+    logging.info("creating the dataframe...")
+    if not list_temp:
+        # pd.concat would raise a ValueError too, but with an opaque message.
+        raise ValueError("No data was retrieved from the API for any KPI.")
+    return pd.concat(list_temp)
+
+
+def load_nested_list_into_df_without_headers(text: typing.List) -> pd.DataFrame:
+    """Build a dataframe from a nested list and drop its first row.
+
+    The first inner list is discarded, so the resulting frame keeps the
+    default positional column labels and its index starts at 1.
+    """
+    return pd.DataFrame(text).iloc[1:, :]
+
+
+def extract_data_and_convert_to_df_state_level(
+    group_id: dict,
+    state_code: dict,
+    year_report: str,
+    api_naming_convention: str,
+    source_url: str,
+) -> pd.DataFrame:
+    """Fetch one API response per (KPI, state) pair and stack the rows.
+
+    For each key of ``group_id`` and each key of ``state_code`` the
+    ``~year_report~``, ``~group_id~``, ``~row_position~``,
+    ``~api_naming_convention~`` and ``~state_code~`` placeholders of
+    ``source_url`` are filled in; the last three characters of the KPI key are
+    the row position and the rest is the group id. Requests that fail or
+    return a status other than 200 are logged and skipped.
+
+    Args:
+        group_id: Mapping whose keys are the KPI identifiers to request.
+        state_code: Mapping whose keys are the state codes to request.
+        year_report: Value substituted for ``~year_report~``.
+        api_naming_convention: Value substituted for
+            ``~api_naming_convention~``.
+        source_url: URL template with the placeholders listed above.
+
+    Returns:
+        The concatenation of all per-request frames, each tagged with a
+        ``KPI_Name`` column holding the KPI key.
+
+    Raises:
+        ValueError: If no request produced any data.
+    """
+    list_temp = []
+    for key in group_id:
+        # Everything except the state code is constant for this KPI, so the
+        # partially filled URL is built once per KPI rather than per state.
+        kpi_url = (
+            source_url.replace("~year_report~", year_report)
+            .replace("~group_id~", key[0:-3])
+            .replace("~row_position~", key[-3:])
+            .replace("~api_naming_convention~", api_naming_convention)
+        )
+        for sc in state_code:
+            logging.info(f"reading data from API for KPI {key}...")
+            logging.info(f"reading data from API for state code {sc}...")
+            source_url_new = kpi_url.replace("~state_code~", sc)
+            try:
+                r = requests.get(source_url_new, stream=True)
+                logging.info(f"Source url : {source_url_new}")
+                logging.info(f"status code : {r.status_code}")
+                if r.status_code == 200:
+                    text = r.json()
+                    frame = load_nested_list_into_df_without_headers(text)
+                    frame["KPI_Name"] = key
+                    list_temp.append(frame)
+                else:
+                    # Make skipped requests visible instead of dropping them.
+                    logging.warning(
+                        f"skipping KPI {key} for state code {sc}: "
+                        f"unexpected status code {r.status_code}"
+                    )
+            except (OSError, ValueError) as e:
+                # OSError covers requests' network errors; ValueError covers
+                # a body that is not valid JSON. The request is skipped.
+                logging.warning(f"error : {e}")
+
+    logging.info("creating the dataframe...")
+    if not list_temp:
+        # pd.concat would raise a ValueError too, but with an opaque message.
+        raise ValueError("No data was retrieved from the API for any KPI/state.")
+    return pd.concat(list_temp)
+
+
+def create_geo_id(df: pd.DataFrame, concat_col: typing.List[str]) -> pd.DataFrame:
+    """Add a ``geo_id`` column built by concatenating the given columns.
+
+    Args:
+        df: Dataframe to extend; it is modified in place.
+        concat_col: Names of the columns to concatenate, in order. Their
+            values must support ``+`` with strings.
+
+    Returns:
+        The same dataframe object, now containing ``geo_id``. With an empty
+        ``concat_col`` every ``geo_id`` is the empty string.
+    """
+    df["geo_id"] = ""
+    for col in concat_col:
+        df["geo_id"] = df["geo_id"] + df[col]
+    return df
+
+
+def pad_zeroes_to_the_left(val: str, length: int) -> str:
+    """Return ``str(val)`` left-padded with ``"0"`` up to ``length`` characters.
+
+    Values that are already ``length`` characters or longer come back
+    unchanged (as strings).
+    """
+    return str(val).rjust(length, "0")
+
+
+def rename_headers(df: pd.DataFrame, rename_mappings: dict) -> None:
+    """Rename the columns of ``df`` in place.
+
+    Keys of ``rename_mappings`` are converted to ``int`` (the positional
+    column labels) and values to ``str`` (the new names) before renaming.
+    """
+    columns_by_position = {}
+    for position, label in rename_mappings.items():
+        columns_by_position[int(position)] = str(label)
+    df.rename(columns=columns_by_position, inplace=True)
+
+
+def save_to_new_file(df: pd.DataFrame, file_path: str) -> None:
+    """Write ``df`` as CSV to ``file_path`` without the index column."""
+    df.to_csv(path_or_buf=file_path, index=False)
+
+
+def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None:
+    """Upload the local file at ``file_path`` to ``gs://gcs_bucket/gcs_path``."""
+    target_blob = storage.Client().bucket(gcs_bucket).blob(gcs_path)
+    target_blob.upload_from_filename(file_path)
+
+
+if __name__ == "__main__":
+    # Emit INFO-level messages from the root logger.
+    logging.getLogger().setLevel(logging.INFO)
+
+    # All configuration comes from environment variables; a missing one
+    # raises KeyError. JSON-encoded variables are decoded before the call.
+    env = os.environ
+    main(
+        source_url=env["SOURCE_URL"],
+        year_report=env["YEAR_REPORT"],
+        api_naming_convention=env["API_NAMING_CONVENTION"],
+        target_file=pathlib.Path(env["TARGET_FILE"]).expanduser(),
+        target_gcs_bucket=env["TARGET_GCS_BUCKET"],
+        target_gcs_path=env["TARGET_GCS_PATH"],
+        headers=json.loads(env["CSV_HEADERS"]),
+        rename_mappings=json.loads(env["RENAME_MAPPINGS"]),
+        pipeline_name=env["PIPELINE_NAME"],
+        geography=env["GEOGRAPHY"],
+        report_level=env["REPORT_LEVEL"],
+        concat_col=json.loads(env["CONCAT_COL"]),
+    )
diff --git a/datasets/census_bureau_acs/_images/run_csv_transform_kub/requirements.txt b/datasets/census_bureau_acs/_images/run_csv_transform_kub/requirements.txt
@@ -0,0 +1,5 @@
+requests
+pandas
+google-cloud-storage
+numpy
+
diff --git a/datasets/census_bureau_acs/_terraform/cbsa_2019_1yr_pipeline.tf b/datasets/census_bureau_acs/_terraform/cbsa_2019_1yr_pipeline.tf
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+# BigQuery table for the CBSA 2019 1-year report (no schema is declared here).
+resource "google_bigquery_table" "cbsa_2019_1yr" {
+  project     = var.project_id
+  dataset_id  = "census_bureau_acs"
+  table_id    = "cbsa_2019_1yr"
+  description = "CBSA 2019 1 year report table"
+
+  # dataset_id is a literal string, so the dependency on the dataset
+  # resource is declared explicitly.
+  depends_on = [
+    google_bigquery_dataset.census_bureau_acs
+  ]
+}
+
+# Short table id of the table above.
+output "bigquery_table-cbsa_2019_1yr-table_id" {
+  value = google_bigquery_table.cbsa_2019_1yr.table_id
+}
+
+# Provider-assigned resource id of the table above.
+output "bigquery_table-cbsa_2019_1yr-id" {
+  value = google_bigquery_table.cbsa_2019_1yr.id
+}
diff --git a/datasets/census_bureau_acs/_terraform/cbsa_2019_5yr_pipeline.tf b/datasets/census_bureau_acs/_terraform/cbsa_2019_5yr_pipeline.tf
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+# BigQuery table for the CBSA 2019 5-year report (no schema is declared here).
+resource "google_bigquery_table" "cbsa_2019_5yr" {
+  project     = var.project_id
+  dataset_id  = "census_bureau_acs"
+  table_id    = "cbsa_2019_5yr"
+  description = "CBSA 2019 5 years report table"
+
+  # dataset_id is a literal string, so the dependency on the dataset
+  # resource is declared explicitly.
+  depends_on = [
+    google_bigquery_dataset.census_bureau_acs
+  ]
+}
+
+# Short table id of the table above.
+output "bigquery_table-cbsa_2019_5yr-table_id" {
+  value = google_bigquery_table.cbsa_2019_5yr.table_id
+}
+
+# Provider-assigned resource id of the table above.
+output "bigquery_table-cbsa_2019_5yr-id" {
+  value = google_bigquery_table.cbsa_2019_5yr.id
+}
diff --git a/datasets/census_bureau_acs/_terraform/census_bureau_acs_dataset.tf b/datasets/census_bureau_acs/_terraform/census_bureau_acs_dataset.tf
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+# BigQuery dataset that holds the American Community Survey tables.
+resource "google_bigquery_dataset" "census_bureau_acs" {
+  dataset_id  = "census_bureau_acs"
+  project     = var.project_id
+  description = "American Community Survey dataset"
+}
+
+# Dataset id of the dataset above.
+output "bigquery_dataset-census_bureau_acs-dataset_id" {
+  value = google_bigquery_dataset.census_bureau_acs.dataset_id
+}
+
+# GCS bucket for this dataset; its name is the configured prefix followed
+# by "-census-bureau-acs".
+resource "google_storage_bucket" "census-bureau-acs" {
+  name     = "${var.bucket_name_prefix}-census-bureau-acs"
+  location = "US"
+
+  uniform_bucket_level_access = true
+
+  # NOTE(review): force_destroy lets `terraform destroy` delete the bucket
+  # even when it still contains objects; confirm this is intended.
+  force_destroy = true
+}
+
+# Name of the bucket above.
+output "storage_bucket-census-bureau-acs-name" {
+  value = google_storage_bucket.census-bureau-acs.name
+}