From 58e5dfa620600ef5564d6f33a2df6f58d4dbc9fb Mon Sep 17 00:00:00 2001 From: Lorenzo Caggioni Date: Tue, 30 Jun 2020 10:56:27 +0200 Subject: [PATCH] Fixes --- data-solutions/gcs-to-bq/README.md | 27 ++++++++++++++--- data-solutions/gcs-to-bq/main.tf | 5 ++-- data-solutions/gcs-to-bq/scripts/README.md | 4 +++ .../scripts/data_ingestion/README.md | 29 +++++++++++++++---- .../scripts/data_ingestion/REQUIREMENTS.txt | 5 ++-- 5 files changed, 56 insertions(+), 14 deletions(-) create mode 100644 data-solutions/gcs-to-bq/scripts/README.md diff --git a/data-solutions/gcs-to-bq/README.md b/data-solutions/gcs-to-bq/README.md index f1a13578f..46c19467b 100644 --- a/data-solutions/gcs-to-bq/README.md +++ b/data-solutions/gcs-to-bq/README.md @@ -69,7 +69,7 @@ This sample creates several distinct groups of resources: | vm | GCE VMs. | | -## Test your environment +## Test your environment with Cloud Dataflow You can now connect to the GCE instance with the following command: ```hcl @@ -106,14 +106,33 @@ python data_ingestion.py \ --region=europe-west1 \ --staging_location=gs://lc-001-eu-df-tmplocation/ \ --temp_location=gs://lc-001-eu-df-tmplocation/ \ ---project=lcaggio-demo-001 \ ---input=gs://lc-001-eu-data/person.csv \ +--project=lcaggio-demo \ +--input=gs://lc-eu-data/person.csv \ --output=bq_dataset.df_import \ ---service_account_email=df-test@lcaggio-aa-demo-001.iam.gserviceaccount.com \ +--service_account_email=df-test@lcaggio-demo.iam.gserviceaccount.com \ --network=local \ --subnetwork=regions/europe-west1/subnetworks/subnet \ --dataflow_kms_key=projects/lcaggio-demo-kms/locations/europe-west1/keyRings/my-keyring-regional/cryptoKeys/key-df \ --no_use_public_ips ``` +You can check data imported into Google BigQuery from the Google Cloud Console UI. 
+ +## Test your environment with 'bq' CLI +You can now connect to the GCE instance with the following command: + +```hcl + gcloud compute ssh vm-example-1 +``` + +You can now run a simple 'bq load' command to import data into BigQuery. Below is an example command: + +```hcl +bq load \ +--source_format=CSV \ +bq_dataset.bq_import \ +gs://my-bucket/person.csv \ +schema_bq_import.json +``` + You can check data imported into Google BigQuery from the Google Cloud Console UI. \ No newline at end of file diff --git a/data-solutions/gcs-to-bq/main.tf b/data-solutions/gcs-to-bq/main.tf index 2ffc352a8..84521c033 100644 --- a/data-solutions/gcs-to-bq/main.tf +++ b/data-solutions/gcs-to-bq/main.tf @@ -15,7 +15,8 @@ locals { vm-startup-script = join("\n", [ "#! /bin/bash", - "apt-get update && apt-get install -y bash-completion git python3-venv gcc build-essential python-dev" + "apt-get update && apt-get install -y bash-completion git python3-venv gcc build-essential python-dev python3-dev", + "pip3 install --upgrade setuptools pip" ]) } @@ -230,7 +231,7 @@ module "vm_example" { } } ] - instance_count = 1 + instance_count = 2 boot_disk = { image = "projects/debian-cloud/global/images/family/debian-10" type = "pd-ssd" diff --git a/data-solutions/gcs-to-bq/scripts/README.md b/data-solutions/gcs-to-bq/scripts/README.md new file mode 100644 index 000000000..2ab413570 --- /dev/null +++ b/data-solutions/gcs-to-bq/scripts/README.md @@ -0,0 +1,4 @@ +# Scripts +In this section you can find two simple scripts to test your environment: + - [Data ingestion](./data_ingestion/): a simple Apache Beam Python pipeline to import data from Google Cloud Storage into BigQuery. + - [Person details generator](./person_details_generator/): a simple script to generate some random data to test your environment.
\ No newline at end of file diff --git a/data-solutions/gcs-to-bq/scripts/data_ingestion/README.md b/data-solutions/gcs-to-bq/scripts/data_ingestion/README.md index 1f1bac716..f706021c2 100644 --- a/data-solutions/gcs-to-bq/scripts/data_ingestion/README.md +++ b/data-solutions/gcs-to-bq/scripts/data_ingestion/README.md @@ -28,7 +28,7 @@ Create a new virtual environment (recommended) and install requirements: ``` virtualenv env source ./env/bin/activate -pip install -r requirements.txt +pip3 install -r requirements.txt ``` ## 4. Upload files into Google Cloud Storage @@ -63,7 +63,7 @@ python data_ingestion.py \ or you can run the pipeline on Google Dataflow using the following command: ``` -python pipelines/data_ingestion_configurable.py \ +python data_ingestion.py \ --runner=DataflowRunner \ --max_num_workers=100 \ --autoscaling_algorithm=THROUGHPUT_BASED \ @@ -71,10 +71,27 @@ python pipelines/data_ingestion_configurable.py \ --staging_location=###PUT HERE GCS STAGING LOCATION### \ --temp_location=###PUT HERE GCS TMP LOCATION###\ --project=###PUT HERE PROJECT ID### \ ---input-bucket=###PUT HERE GCS BUCKET NAME### \ ---input-path=###PUT HERE INPUT FOLDER### \ ---input-files=###PUT HERE FILE NAMES### \ ---bq-dataset=###PUT HERE BQ DATASET NAME### +--input=###PUT HERE GCS BUCKET NAME. EXAMPLE: gs://bucket_name/person.csv### \ +--output=###PUT HERE BQ DATASET NAME. EXAMPLE: bq_dataset.df_import### +``` + +Below is an example to run the pipeline specifying Network and Subnetwork, using private IPs and using a KMS key to encrypt data at rest: + +``` +python data_ingestion.py \ +--runner=DataflowRunner \ +--max_num_workers=100 \ +--autoscaling_algorithm=THROUGHPUT_BASED \ +--region=###PUT HERE REGION### \ +--staging_location=###PUT HERE GCS STAGING LOCATION### \ +--temp_location=###PUT HERE GCS TMP LOCATION### \ +--project=###PUT HERE PROJECT ID### \ +--network=###PUT HERE YOUR NETWORK### \ +--subnetwork=###PUT HERE YOUR SUBNETWORK.
EXAMPLE: regions/europe-west1/subnetworks/subnet### \ +--dataflow_kms_key=###PUT HERE KMS KEY. EXAMPLE: projects/lcaggio-d-4-kms/locations/europe-west1/keyRings/my-keyring-regional/cryptoKeys/key-df### \ +--input=###PUT HERE GCS BUCKET NAME. EXAMPLE: gs://bucket_name/person.csv### \ +--output=###PUT HERE BQ DATASET NAME. EXAMPLE: bq_dataset.df_import### \ +--no_use_public_ips ``` ## 6. Check results diff --git a/data-solutions/gcs-to-bq/scripts/data_ingestion/REQUIREMENTS.txt b/data-solutions/gcs-to-bq/scripts/data_ingestion/REQUIREMENTS.txt index ce9b3d903..32bfbbd16 100644 --- a/data-solutions/gcs-to-bq/scripts/data_ingestion/REQUIREMENTS.txt +++ b/data-solutions/gcs-to-bq/scripts/data_ingestion/REQUIREMENTS.txt @@ -1,2 +1,3 @@ -wheel -apache-beam +apache-beam[gcp] +setuptools +wheel \ No newline at end of file