scrapy爬虫项目部署到k8s

2021-03-17

背景:

  1. 要求定时执行
  2. 每个task都独立执行,有的并行,有的串行
  3. 本身并没有使用任务调度框架

解决方案:

创建一个 JSON 文件(tasks.json):以任务 ID(与 task 名相同)作为 key,value 包含 cron 和 cmd,其中 cmd 可以用换行分隔多条命令,从而实现串行执行。Jenkins 构建时选择对应的 ID 作为入参,动态生成启动脚本。

tasks.json

{
  "balenciaga_screenshots_zh":{
    "cron":"0 */1 * * *",
    "cmd":"scrapy crawl balenciaga_screenshots_zh"
  },
  "instagram":{
    "cron":"0 */1 * * *",
    "cmd":"scrapy crawl instagram"
  },
  "celine_celine-uk":{
    "cron":"*/1 * * * *",
    "cmd":"scrapy crawl celine\nscrapy crawl celine-uk"
  },
  "celine":{
    "cron":"3 8 * * *",
    "cmd":"scrapy crawl celine"
  }
}

创建基础镜像

python37 Dockerfile
依赖的requirements.txt放到Dockerfile同目录

# Base image: Python 3.7 with the shared crawler dependencies preinstalled.
FROM rackspacedot/python37:30
# COPY instead of ADD: this is a plain local file, and ADD's extra behaviour
# (archive extraction, URL download) is not wanted here.
COPY requirements.txt requirements.txt
# One layer: switch apt to the Aliyun mirror, install the Python dependencies
# from the Aliyun PyPI mirror, drop package caches, and set the time zone to
# Asia/Shanghai.
# The index is fetched over HTTPS; the trusted-host setting is kept so the
# mirror is still accepted if its certificate cannot be validated in the image.
RUN  sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list \
    && apt-get update \
#    && apt-get install -y --no-install-recommends iputils-ping net-tools curl \
    && pip install --upgrade pip \
    && pip config set global.index-url https://mirrors.aliyun.com/pypi/simple \
    && pip config set install.trusted-host mirrors.aliyun.com \
    && pip --no-cache-dir install -r requirements.txt --ignore-installed \
    && apt-get autoclean -y \
    && apt-get clean -y \
    && rm -rf ~/.cache/pip/* \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone

构建镜像

# Tag of the base image in the private Harbor registry.
IMAGE_URL=harbor.demo.com/demo/python37-baseimage:2.0.0
# Build from the current directory; --no-cache forces every layer to be rebuilt.
# The expansion is quoted so the tag is always passed as a single argument.
docker build --rm --no-cache -t "$IMAGE_URL" .

scrapy项目Dockerfile
run.sh是pipeline中动态生成的

# Image for the Scrapy project, built on the shared Python 3.7 base image.
FROM harbor.demo.com/demo/python37-baseimage:2.0.0
WORKDIR /app
# Install dependencies before copying the sources, so this layer only has to be
# rebuilt when requirements.txt itself changes.
COPY requirements.txt /app/requirements.txt
RUN pip --no-cache-dir install -r requirements.txt --ignore-installed && rm -rf ~/.cache/pip/*
COPY scrapy_spiders/ /app/scrapy_spiders
# run.sh is generated by the pipeline for the selected task; it is copied last
# because it differs from task to task.
COPY run.sh /app/run.sh
CMD /app/run.sh

创建cronjob模板

cronjob.yml

# CronJob template. Tokens in curly braces are placeholders that the pipeline's
# deploy stage replaces with sed before running kubectl apply.
# NOTE(review): batch/v1beta1 CronJob was deprecated in Kubernetes 1.21 and is no
# longer served from 1.25 (use batch/v1) — confirm the target cluster version.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: {CRONJOB}
spec:
  # Cron expression of the selected task, taken from tasks.json by the pipeline.
  schedule: {CRON}
  # Do not start a new run while the previous one is still running.
  concurrencyPolicy: Forbid
  # Keep only the most recent failed Job and the most recent successful Job.
  failedJobsHistoryLimit: 1
  successfulJobsHistoryLimit: 1
  jobTemplate:
    spec:
      template:
        spec:
          # Whole-line placeholder: replaced with the pipeline's NODE_NAME value
          # (in the pipeline shown, a complete "nodeName: ..." line).
          {NODE_NAME}
          imagePullSecrets:
          - name: harbor-registry
          #restartPolicy: OnFailure
          # A failed container is not restarted in place.
          restartPolicy: Never
          containers:
          - name: {CRONJOB}
            image: {IMAGE_URL}
            env:
            - name: TZ
              value: Asia/Shanghai
            - name: GROUP
              value: {IMAGE_GROUP}
            # Receives the pipeline's ENV value (job-name prefix).
            # NOTE(review): presumably read by Scrapy to pick the settings for
            # that environment — verify against the project's scrapy.cfg.
            - name: SCRAPY_PROJECT
              value: {ENV}
            resources:
              # Fixed requests; the limits come from the pipeline (LIMIT_MEM / LIMIT_CPU).
              requests:
                memory: "200Mi"
                cpu: "50m"
              limits:
                memory: {LIMIT_MEM}
                cpu: {LIMIT_CPU}

创建jenkins pipeline

pipeline {
    // Run every stage on the node labelled 'master'.
    agent {
        label 'master'
    }
    options {
        // Lets the console render the ANSI colour escapes used by the stage banners.
        ansiColor('xterm')
        // Log rotation: builds are kept for at most 1 day and at most 3 builds.
        buildDiscarder(logRotator(daysToKeepStr: '1', numToKeepStr: '3')) 
    }
    // Tool installations, referenced by name.
    // NOTE(review): maven and jdk are not used by any stage shown here — confirm
    // whether they are still required for this Python project.
    tools {
        maven 'apache_maven_3.5.0'
        jdk 'jdk_1.8_202'
        git 'git_2.19.1'
        dockerTool 'docker_19.03.12'
    }
    // Build parameters:
    //   BRANCH_OR_TAG - branch or tag of the spider repository to build (default 'master').
    //   TASK          - the task to deploy; its value is used as the lookup key in tasks.json.
    parameters{
        listGitBranches(
            remoteURL: 'http://gitlab.demo.com/arch/pis-spider.git',
            branchFilter: '.*',
            tagFilter: '*',
            defaultValue: 'master',
            name: 'BRANCH_OR_TAG',
            type: 'PT_BRANCH_TAG',
            selectedValue: 'TOP',
            sortMode: 'DESCENDING_SMART',
            quickFilterEnabled: 'true',
            credentialsId: 'gitlab_username_password_credential')
        // Choices are newline-separated. Every entry needs a matching key in tasks.json,
        // because later stages read .cmd and .cron for the selected task.
        choice(name:'TASK', choices:'balenciaga_screenshots_zh\nburberry_uk\nceline_celine-uk\nceline\nceline-uk\ndior_fr\ndior_screenshots_zh\ndior_uk\nfendi\nfendi_screenshots_zh\ninstagram\njimmy_choo_screenshots_zh\njimmychoo\nloewe_screenshots_zh\nmaisonmargiela\nmiumiu_uk\nprada\nprada_screenshots_zh\nrogervivier\nthombrowne\nvalentino_it\nzh-gucci')
    }
    // Pipeline-wide variables.
    // NOTE(review): several entries reference variables declared further down
    // (K8S_NAMESPACE -> ENV, DOCKER_IMAGE -> VERSION_VALUE, VERSION_VALUE -> TIME);
    // this relies on Declarative Pipeline resolving environment entries
    // independently of their order — confirm on the Jenkins version in use.
    environment {
        // Repository that is checked out; PROJECT is derived from its last path segment.
        GIT = 'http://gitlab.demo.com/arch/pis-spider.git'
        IMAGE_GROUP = "arch" // Harbor image group the image is pushed under
        // NOTE(review): REPLICAS is not referenced by any stage or template shown here.
        REPLICAS = 1
        // Template file names, looked up under /data/template/k8s and /data/template/docker.
        TEMPLATE="cronjob.yml"
        DOCKERFILE="Dockerfile-py"
        // Kubeconfig file name under /data/kubecfg: job-name prefix (text before the first '-')
        // plus "-cluster", except that the prefix "dev" maps to "test-cluster".
        CLUSTER = sh(script: """echo ${JOB_BASE_NAME} | awk -F '-' '{if (\$1=="dev") {print "test-cluster"} else {print \$1"-cluster"}}' """, returnStdout: true).trim()
        // Substituted verbatim for the {NODE_NAME} line of the template, pinning the pod to one node.
        NODE_NAME= 'nodeName: test-devops-k8sslave-4'
        // NODE_NAME = " "
        // Container resource limits substituted into the template.
        LIMIT_MEM="2248Mi"
        LIMIT_CPU="1000m"
        // Target namespace: <ENV>-<IMAGE_GROUP>.
        K8S_NAMESPACE = "${ENV}-${IMAGE_GROUP}"
        // Repository name without the ".git" suffix, prefixed with "<IMAGE_GROUP>-" unless its
        // first '-'-separated field already equals IMAGE_GROUP, then lower-cased.
        PROJECT = sh(script: """echo ${GIT} | awk -F '/' '{print \$NF}' | awk -F '.' '{print \$1}' | awk -F '-' '{if (\$1=="${IMAGE_GROUP}") {print \$0} else {print "${IMAGE_GROUP}-"\$0}}'| tr "[:upper:]" "[:lower:]" """, returnStdout: true).trim()
        // CronJob / container name: "<PROJECT>-<TASK>" with every '_' replaced by '-'.
        CRONJOB = sh(script: """echo ${PROJECT}-${TASK}| tr "_" "-" """, returnStdout: true).trim()
        // Environment name: job-name prefix before the first '-'.
        ENV = sh(script: "echo ${JOB_BASE_NAME} | awk -F '-' '{print \$1}'", returnStdout: true).trim()
        // Same prefix, except that "test" becomes "newtest".
        // NOTE(review): NEWENV is not referenced by any stage shown here.
        NEWENV = sh(script: """echo ${JOB_BASE_NAME} | awk -F '-' '{if (\$1=="test") {print "new"\$1} else {print \$1}}' """, returnStdout: true).trim()
        HARBOR_HOST = 'harbor.demo.com'
        // Image reference without the registry host: <group>/<job name>:<version>.
        DOCKER_IMAGE = "${IMAGE_GROUP}/${JOB_BASE_NAME}:${VERSION_VALUE}"
        // Third '/'-separated field of BRANCH_OR_TAG when present, otherwise the first field.
        // NOTE(review): a ref such as refs/heads/feature/x would be cut to "feature" — confirm branch naming.
        CHECK_TAG = sh(script: "echo ${BRANCH_OR_TAG} | awk -F '/' '{if (\$3) print \$3; else print \$1}'", returnStdout: true).trim()  // branch or tag name
        VERSION_VALUE = "${CHECK_TAG}-${TIME}" // image version: <branch or tag>-<build timestamp>
        // Build timestamp, formatted as yyyymmddHHMMSS.
        TIME = sh(script: "date '+%Y%m%d%H%M%S'", returnStdout: true).trim()
    }
    stages {
        // Fetch the repository into a clean workspace and switch to the requested branch or tag.
        stage ('代码获取') {
            steps {
                echo "\033[46;30m************************************************ 拉取代码开始 ************************************************\033[0m"
                deleteDir() // wipe the workspace before cloning
                git credentialsId: 'gitlab_username_password_credential', url: "${GIT}"
                // Abort with exit code 111 when CHECK_TAG is empty or cannot be checked out.
                // The ref is quoted, and plain `echo` is used because `echo -e` would print
                // a literal "-e" under a POSIX sh such as dash.
                sh '''
                    if [ -z "${CHECK_TAG}" ] || ! git checkout "${CHECK_TAG}"; then
                        echo "切换至指定的tag的版本,tag:${CHECK_TAG} 不存在或为空,请检查输入的tag!"
                        exit 111
                    fi
                '''
                // Show the branch or tag as the build's display name.
                buildName "${CHECK_TAG}"
                echo "\033[46;30m************************************************ 拉取代码结束 ************************************************\033[0m"
            }
        }
            
        // Generate run.sh, the container entry script, from the selected task's "cmd" in tasks.json.
        stage ('创建启动脚本') {
            steps {
                echo "\033[46;30m************************************************ 创建脚本开始 ************************************************\033[0m"
                // printf instead of `echo -e`: under a POSIX sh such as dash, `echo -e` would
                // write "-e" in front of the shebang and break the script.
                // jq options: -r writes the raw command text (embedded newlines become separate,
                // serially executed lines); -e exits non-zero when the task has no "cmd", so the
                // build fails instead of shipping a run.sh containing "null"; --arg passes the
                // task name as data rather than splicing it into the jq filter.
                sh '''
                    printf '%s\\n' '#!/bin/bash' 'cd /app/scrapy_spiders/' > run.sh
                    chmod +x run.sh
                    jq -e -r --arg task "$TASK" '.[$task].cmd' tasks.json >> run.sh
                '''
                echo "\033[46;30m************************************************ 创建脚本结束 ************************************************\033[0m"
            }
        }

        // Build the task image from the shared Dockerfile template, push it to Harbor,
        // then remove the local copy from the build node.
        stage('镜像构建') {
            steps {
                echo "\033[46;30m************************************************ 镜像构建开始 ************************************************\033[0m"
                script {
                    // DOCKERFILE is the name declared in the environment block; the previous
                    // spelling "Dockerfile" did not match that declaration.
                    sh "/usr/bin/cp -f /data/template/docker/${DOCKERFILE} Dockerfile"
                    sh "docker build -t ${HARBOR_HOST}/${DOCKER_IMAGE} ."
                    sh "docker push ${HARBOR_HOST}/${DOCKER_IMAGE}"
                    sh "docker rmi ${HARBOR_HOST}/${DOCKER_IMAGE}"
                }
                echo "\033[46;30m************************************************ 镜像构建结束 ************************************************\033[0m"
            }
        }

        // Render the CronJob template for the selected task and apply it to the target cluster.
        stage('发布服务至kubernetes集群') {
            environment {
                // Cron expression of the selected task. No `-r` here: jq keeps the JSON double
                // quotes, so the rendered `schedule:` is a quoted YAML string even when the
                // expression starts with `*`. `-e` makes jq exit non-zero when the task has no
                // "cron" entry, failing the build instead of rendering `schedule: null`.
                // --arg passes the task name as data rather than splicing it into the filter.
                CRON = sh(script: 'jq -e --arg task "$TASK" \'.[$task].cron\' tasks.json', returnStdout: true).trim()
            }
            steps {
                script {
                    echo "\033[46;30m************************************************ 发布服务至kubernetes集群开始 ************************************************\033[0m"
                    sh "cp /data/template/k8s/${TEMPLATE} ${TEMPLATE}"
                    // Fill in every {PLACEHOLDER}. '#' is the sed delimiter, so the substituted
                    // values must not contain '#'.
                    sh "sed -i -e 's#{IMAGE_URL}#${HARBOR_HOST}/${DOCKER_IMAGE}#g;s#{ENV}#${ENV}#g;s#{NODE_NAME}#${NODE_NAME}#g;s#{PROJECT}#${PROJECT}#g;s#{CRONJOB}#${CRONJOB}#g;s#{CRON}#${CRON}#g;s#{IMAGE_GROUP}#${IMAGE_GROUP}#g;s#{LIMIT_MEM}#${LIMIT_MEM}#g;s#{LIMIT_CPU}#${LIMIT_CPU}#g;' ${TEMPLATE}"
                    // Connectivity check against the cluster before applying.
                    sh "kubectl --kubeconfig /data/kubecfg/${CLUSTER} cluster-info && kubectl --kubeconfig /data/kubecfg/${CLUSTER} get nodes"
                    sh "kubectl --kubeconfig /data/kubecfg/${CLUSTER} apply -f ${TEMPLATE} --namespace=${K8S_NAMESPACE}"
                    echo "\033[46;30m************************************************ 发布服务至kubernetes集群结束 ************************************************\033[0m"
                }
            }
        }
    } 

}

标题:scrapy爬虫项目部署到k8s
作者:fish2018
地址:http://seczh.com/articles/2021/03/17/1615950104851.html