diff --git a/README.md b/README.md
index 9567ca7dad90056d2a39f29d04b749712457086f..17bf4daeead4e533d004b82ff29b7474a00adab1 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,7 @@
 This project includes operations on Drupal sites or across infrastructure that are
 not part of the drupalSite-operator, such as Tekton tasks.
 We implement actions that the infrastructure users can apply ad-hoc to their websites, and also other infrastructure components can use to perform their tasks.
+
+## Examples
+
+The `examples/` directory contains one-off operations that illustrate how something could be done, without providing full automation.
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e9431827bd7eda22d348364bd8bb21abe6d76cfa
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,10 @@
+### tekton-tasks
+
+Example TaskRuns for ad-hoc operations on Drupal sites: clearing caches, backing up and restoring sites and databases.
+
+### logs-hdfs
+
+How to fetch site logs from long-term storage on HDFS.
+
+The Jupyter notebook should be run on the SWAN service ([swan.cern.ch](https://swan.cern.ch))
+using the Spark plugin, which provides the `spark` session the notebook relies on.
diff --git a/examples/logs-hdfs/fetchHomeData.ipynb b/examples/logs-hdfs/fetchHomeData.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2ef4bb79464f8cfc2e6ccfffde8fa91b10b8b2ca
--- /dev/null
+++ b/examples/logs-hdfs/fetchHomeData.ipynb
@@ -0,0 +1,183 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "bb5bd38c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql.functions import col, asc, countDistinct, date_format, from_unixtime\n",
+    "from pyspark.sql import functions as F\n",
+    "from datetime import date, timedelta\n",
+    "import pandas as pd\n",
+    "from pyspark.sql import DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "05d5f738",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "start_date = date(2020, 8, 1)\n",
+    "duration = timedelta(days=31)\n",
+    "\n",
+    "def fetchLogs(dates):  # read one day of JSON access logs per date from HDFS\n",
+    "    paths = ['/project/monitoring/archive/drupal/logs/prod8/drupal_service/drupal8/'+date.strftime(\"%Y/%m/%d\")+'/*'\n",
+    "             for date in dates]\n",
+    "    return spark.read.json(paths)  # `spark` is provided by the SWAN Spark plugin\n",
+    "    #return sc.union(spark.read.json(paths))\n",
+    "def selectClientip(log, whereFilt):  # keep the client IP and a day-level timestamp for matching rows\n",
+    "    return log.select(col(\"data.clientip\"), date_format(from_unixtime(col(\"metadata.timestamp\")/1000), \"yyyy-MM-dd\")\n",
+    "                      .alias(\"timestamp\")).where(whereFilt)\n",
+    "def concatClientipLogs(start_date, duration, whereFilt):  # client IPs over the whole date range\n",
+    "    dates = [start_date + timedelta(days=d) for d in range(duration.days+1)]  # inclusive of the final day\n",
+    "    return selectClientip(fetchLogs(dates), whereFilt)\n",
+    "\n",
+    "#https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.transform.html#pyspark.sql.functions.transform\n",
+    "def datebinning(col):  # date-bin granularity; day/week variants kept for reference\n",
+    "    #return F.dayofyear(col)\n",
+    "    #return F.weekofyear(col)\n",
+    "    return F.month(col)\n",
+    "def countUniqueIPinDatebin(df):  # unique client IPs per date bin\n",
+    "    return df.groupBy(\"datebin\").agg(countDistinct(\"clientip\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "24fd5729",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "homeAccessLogs = concatClientipLogs(start_date, duration, 'data.program == \"httpd\" AND data.sitename == \"home.cern\"')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "63c5d412",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "homeAccessLogsBinned = countUniqueIPinDatebin(homeAccessLogs.withColumn('datebin', datebinning('timestamp')).select([\"clientip\",\"datebin\"])).toPandas().sort_values(by=\"datebin\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ed7b9057",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>datebin</th>\n",
+       "      <th>count(clientip)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>8</td>\n",
+       "      <td>415728</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>9</td>\n",
+       "      <td>17528</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   datebin  count(clientip)\n",
+       "1        8           415728\n",
+       "0        9            17528"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "homeAccessLogsBinned.sort_values(by=\"datebin\")\n",
+    "# F.month:\n",
+    "# datebin\tcount(clientip)\n",
+    "#0\t12\t108026\n",
+    "#1\t9\t334389\n",
+    "#2\t10\t306606\n",
+    "#3\t11\t335701\n",
+    "\n",
+    "# F.dayofyear:\n",
+    "# avg(Sept): 13291\n",
+    "# 0.8386309601436553"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2e3536a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "homeAccessLogsBinned.sort_values(by=\"datebin\").to_csv(\"homeAccessLogs.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fc7d98b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  },
+  "sparkconnect": {
+   "bundled_options": [],
+   "list_of_options": []
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/clear-cache-taskrun.yaml b/examples/tekton-tasks/clear-cache-taskrun.yaml
similarity index 100%
rename from examples/clear-cache-taskrun.yaml
rename to examples/tekton-tasks/clear-cache-taskrun.yaml
diff --git a/examples/database-restore-taskrun.yaml b/examples/tekton-tasks/database-restore-taskrun.yaml
similarity index 100%
rename from examples/database-restore-taskrun.yaml
rename to examples/tekton-tasks/database-restore-taskrun.yaml
diff --git a/examples/drupalsite-backup-taskrun.yaml b/examples/tekton-tasks/drupalsite-backup-taskrun.yaml
similarity index 100%
rename from examples/drupalsite-backup-taskrun.yaml
rename to examples/tekton-tasks/drupalsite-backup-taskrun.yaml
diff --git a/examples/drupalsite-restore-taskrun.yaml b/examples/tekton-tasks/drupalsite-restore-taskrun.yaml
similarity index 73%
rename from examples/drupalsite-restore-taskrun.yaml
rename to examples/tekton-tasks/drupalsite-restore-taskrun.yaml
index a26a6ed325cdc08f149cca9ec2c43ebf4b72d7e1..a4cf0379b4748ed0a0ad91e2be2c968a67c8bca4 100644
--- a/examples/drupalsite-restore-taskrun.yaml
+++ b/examples/tekton-tasks/drupalsite-restore-taskrun.yaml
@@ -8,9 +8,9 @@ spec:
     kind: ClusterTask
   params:
   - name: drupalSite
-    value: drupalsite-sample
+    value: arts
   - name: backupName
-    value: ravineet-1-tekton-test-fbfe0
+    value: arts-901a-20220530000645
   - name: namespace
-    value: ravineet-1
+    value: arts
   serviceAccountName: tektoncd
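
For readers who want to adapt the logs-hdfs notebook rather than the TaskRuns, the sketch below shows one way the same HDFS log data could be aggregated per week for a different site. It is only a sketch, not part of the change above: it assumes a SWAN session where the Spark plugin already provides `spark`, it reuses the HDFS path layout hard-coded in the notebook's `fetchLogs`, and `weekly_unique_ips` and the site name `test-site.web.cern.ch` are hypothetical, not existing code.

```python
# Hypothetical variation on fetchHomeData.ipynb: weekly (instead of monthly)
# unique-visitor counts for a configurable site. Assumes the SWAN Spark plugin
# has already created the `spark` session and that the HDFS layout matches the
# one used in the notebook.
from datetime import date, timedelta

from pyspark.sql.functions import col, countDistinct, date_format, from_unixtime, weekofyear

LOG_ROOT = "/project/monitoring/archive/drupal/logs/prod8/drupal_service/drupal8/"

def fetch_logs(dates):
    # One wildcard path per day; spark.read.json accepts a list of paths.
    paths = [LOG_ROOT + d.strftime("%Y/%m/%d") + "/*" for d in dates]
    return spark.read.json(paths)  # `spark` comes from the SWAN/Spark environment

def weekly_unique_ips(start, days, sitename):
    dates = [start + timedelta(days=i) for i in range(days + 1)]  # inclusive of the final day
    logs = fetch_logs(dates)
    requests = (
        logs.where((col("data.program") == "httpd") & (col("data.sitename") == sitename))
            .select(
                col("data.clientip").alias("clientip"),
                date_format(from_unixtime(col("metadata.timestamp") / 1000), "yyyy-MM-dd").alias("day"),
            )
    )
    return (
        requests.withColumn("week", weekofyear(col("day")))
                .groupBy("week")
                .agg(countDistinct("clientip").alias("unique_ips"))
    )

# Example usage (placeholder site name):
# weekly_unique_ips(date(2020, 8, 1), 31, "test-site.web.cern.ch").toPandas().sort_values("week")
```

As in the notebook, listing one wildcard path per day keeps the read limited to the requested date range instead of globbing a whole month of archives.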