From 70eb7900458be640dafc443014a7ab3024bf33b4 Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 4 Sep 2017 17:12:38 +0500 Subject: [PATCH 01/37] added remove all button --- SpiderKeeper/app/spider/controller.py | 8 ++++++++ SpiderKeeper/app/templates/job_periodic.html | 3 +++ 2 files changed, 11 insertions(+) diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index cc6df6df..63c935e2 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -596,6 +596,14 @@ def job_remove(project_id, job_instance_id): return redirect(request.referrer, code=302) +@app.route("/project//jobs/remove") +def jobs_remove(project_id): + for job_instance in JobInstance.query.filter_by(project_id=project_id): + db.session.delete(job_instance) + db.session.commit() + return redirect(request.referrer, code=302) + + @app.route("/project//job//switch") def job_switch(project_id, job_instance_id): job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first() diff --git a/SpiderKeeper/app/templates/job_periodic.html b/SpiderKeeper/app/templates/job_periodic.html index ac32dac7..3c7126e7 100644 --- a/SpiderKeeper/app/templates/job_periodic.html +++ b/SpiderKeeper/app/templates/job_periodic.html @@ -12,7 +12,10 @@

Periodic jobs

+ Remove All + + {% endblock %} {% block content_body %}
From 2065b2fce9942ed037abcf964a4718e660126c7e Mon Sep 17 00:00:00 2001 From: kalombo Date: Tue, 3 Oct 2017 16:10:27 +0500 Subject: [PATCH 02/37] show stats for each job --- SpiderKeeper/app/__init__.py | 1 - SpiderKeeper/app/proxy/spiderctrl.py | 14 ++++++-- SpiderKeeper/app/schedulers/common.py | 3 +- SpiderKeeper/app/spider/model.py | 32 ++++++++++++++++++- SpiderKeeper/app/static/css/app.css | 1 - SpiderKeeper/app/templates/job_dashboard.html | 22 ++++++++----- requirements.txt | 1 + setup.py | 1 + 8 files changed, 60 insertions(+), 15 deletions(-) diff --git a/SpiderKeeper/app/__init__.py b/SpiderKeeper/app/__init__.py index 8a2ead4d..e1427cf8 100644 --- a/SpiderKeeper/app/__init__.py +++ b/SpiderKeeper/app/__init__.py @@ -2,7 +2,6 @@ import logging import traceback -import apscheduler from apscheduler.schedulers.background import BackgroundScheduler from flask import Flask from flask import jsonify diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index de01ea65..5850b1cd 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -1,9 +1,11 @@ import datetime import random -from functools import reduce +import requests +import re from SpiderKeeper.app import db -from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, JobPriority +from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, \ + JobPriority class SpiderServiceProxy(object): @@ -115,6 +117,14 @@ def sync_job_status(self, project): job_execution.start_time = job_execution_info['start_time'] job_execution.end_time = job_execution_info['end_time'] job_execution.running_status = SpiderStatus.FINISHED + + res = requests.get(self.log_url(job_execution)) + res.encoding = 'utf8' + raw = res.text[-4096:] + match = re.findall(job_execution.RAW_STATS_REGEX, raw, re.DOTALL) + if match: + job_execution.raw_stats = match[0] + job_execution.process_raw_stats() # commit db.session.commit() diff --git a/SpiderKeeper/app/schedulers/common.py b/SpiderKeeper/app/schedulers/common.py index 595518a0..07bac9e0 100644 --- a/SpiderKeeper/app/schedulers/common.py +++ b/SpiderKeeper/app/schedulers/common.py @@ -1,4 +1,3 @@ -import threading import time from SpiderKeeper.app import scheduler, app, agent, db @@ -29,7 +28,7 @@ def sync_spiders(): def run_spider_job(job_instance_id): ''' run spider by scheduler - :param job_instance: + :param job_instance_id: :return: ''' try: diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 5376602b..6169a0c7 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -1,4 +1,6 @@ import datetime +import demjson +import re from sqlalchemy import desc from SpiderKeeper.app import db, Base @@ -159,6 +161,29 @@ class JobExecution(Base): running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING) running_on = db.Column(db.Text) + raw_stats = db.Column(db.Text) + items_count = db.Column(db.Integer) + warnings_count = db.Column(db.Integer) + errors_count = db.Column(db.Integer) + + RAW_STATS_REGEX = '\[scrapy\.statscollectors\][^{]+({[^}]+})' + + def process_raw_stats(self): + if self.raw_stats is None: + return + datetime_regex = '(datetime\.datetime\([^)]+\))' + self.raw_stats = re.sub(datetime_regex, r"'\1'", self.raw_stats) + stats = demjson.decode(self.raw_stats) + self.items_count = stats.get('item_scraped_count') or 0 + self.warnings_count = stats.get('log_count/WARNING') or 0 + self.errors_count = stats.get('log_count/ERROR') or 0 + + def has_warnings(self): + return not self.raw_stats or not self.items_count or self.warnings_count + + def has_errors(self): + return bool(self.errors_count) + def to_dict(self): job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first() return { @@ -171,7 +196,12 @@ def to_dict(self): 'end_time': self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else None, 'running_status': self.running_status, 'running_on': self.running_on, - 'job_instance': job_instance.to_dict() if job_instance else {} + 'job_instance': job_instance.to_dict() if job_instance else {}, + 'has_warnings': self.has_warnings(), + 'has_errors': self.has_errors(), + 'items_count': self.items_count if self.items_count is not None else '-', + 'warnings_count': self.warnings_count if self.warnings_count is not None else '-', + 'errors_count': self.errors_count if self.errors_count is not None else '-' } @classmethod diff --git a/SpiderKeeper/app/static/css/app.css b/SpiderKeeper/app/static/css/app.css index 67a42216..549a4798 100644 --- a/SpiderKeeper/app/static/css/app.css +++ b/SpiderKeeper/app/static/css/app.css @@ -6,7 +6,6 @@ .txt-args { font-size: 10px; - display: block; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; diff --git a/SpiderKeeper/app/templates/job_dashboard.html b/SpiderKeeper/app/templates/job_dashboard.html index 494b93d7..544a86bd 100644 --- a/SpiderKeeper/app/templates/job_dashboard.html +++ b/SpiderKeeper/app/templates/job_dashboard.html @@ -153,12 +153,15 @@

Completed Jobs

Priority Runtime Started + Items + Warnings + Errors Log Status {% for job in job_status.COMPLETED %} {% if job.job_instance %} - + {{ job.job_execution_id }} {{ job.job_instance_id }} {{ job.job_instance.spider_name }} @@ -184,17 +187,20 @@

Completed Jobs

{% endif %} {{ timedelta(job.end_time,job.start_time) }} {{ job.start_time }} + {{ job.items_count }} + {{ job.warnings_count }} + {{ job.errors_count }} Log {% if job.running_status == 2 %} - - FINISHED - - {% else %} - - CANCELED - + + FINISHED + + {% else %} + + CANCELED + {% endif %} {% endif %} diff --git a/requirements.txt b/requirements.txt index 57daba1c..3a1e3fc6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ aniso8601==1.2.0 APScheduler==3.3.1 click==6.7 +demjson==2.2.4 Flask==0.12.1 Flask-BasicAuth==0.2.0 Flask-RESTful==0.3.5 diff --git a/setup.py b/setup.py index 3d4c690b..c822317d 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ 'aniso8601==1.2.0', 'APScheduler==3.3.1', 'click==6.7', + 'demjson==2.2.4', 'Flask==0.12.1', 'Flask-BasicAuth==0.2.0', 'Flask-RESTful==0.3.5', From ec6989729ad092c3e4e619e97b0862be443aeb15 Mon Sep 17 00:00:00 2001 From: kalombo Date: Tue, 3 Oct 2017 16:39:30 +0500 Subject: [PATCH 03/37] some cosmetic fixes --- SpiderKeeper/app/spider/controller.py | 6 +++--- SpiderKeeper/app/spider/model.py | 8 +++++--- SpiderKeeper/app/templates/job_dashboard.html | 2 +- SpiderKeeper/app/templates/job_periodic.html | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index 592b2768..3bfd7af9 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -602,9 +602,9 @@ def job_run(project_id, job_instance_id): return redirect(request.referrer, code=302) -@app.route("/project//job//remove") -def job_remove(project_id, job_instance_id): - job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first() +@app.route("/job//remove") +def job_remove(job_instance_id): + job_instance = JobInstance.query.get(job_instance_id) db.session.delete(job_instance) db.session.commit() return redirect(request.referrer, code=302) diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 5376602b..c3f86c5b 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -1,5 +1,6 @@ import datetime from sqlalchemy import desc +from sqlalchemy.orm import relation from SpiderKeeper.app import db, Base @@ -152,7 +153,9 @@ class JobExecution(Base): project_id = db.Column(db.INTEGER, nullable=False, index=True) service_job_execution_id = db.Column(db.String(50), nullable=False, index=True) - job_instance_id = db.Column(db.INTEGER, nullable=False, index=True) + job_instance_id = db.Column(db.INTEGER, + db.ForeignKey('sk_job_instance.id'), nullable=False, index=True) + job_instance = relation(JobInstance) create_time = db.Column(db.DATETIME) start_time = db.Column(db.DATETIME) end_time = db.Column(db.DATETIME) @@ -160,7 +163,6 @@ class JobExecution(Base): running_on = db.Column(db.Text) def to_dict(self): - job_instance = JobInstance.query.filter_by(id=self.job_instance_id).first() return { 'project_id': self.project_id, 'job_execution_id': self.id, @@ -171,7 +173,7 @@ def to_dict(self): 'end_time': self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else None, 'running_status': self.running_status, 'running_on': self.running_on, - 'job_instance': job_instance.to_dict() if job_instance else {} + 'job_instance': self.job_instance } @classmethod diff --git a/SpiderKeeper/app/templates/job_dashboard.html b/SpiderKeeper/app/templates/job_dashboard.html index 494b93d7..52e33ed6 100644 --- a/SpiderKeeper/app/templates/job_dashboard.html +++ b/SpiderKeeper/app/templates/job_dashboard.html @@ -10,7 +10,7 @@

Job Dashboard

top: 15px; right: 10px;"> {% endblock %} diff --git a/SpiderKeeper/app/templates/job_periodic.html b/SpiderKeeper/app/templates/job_periodic.html index beeb9b94..2811f8aa 100644 --- a/SpiderKeeper/app/templates/job_periodic.html +++ b/SpiderKeeper/app/templates/job_periodic.html @@ -79,8 +79,8 @@

Periodic jobs (Spiders)

Run - Remove + + Remove {% endfor %} From 40277683c3cd77533df7a2d8dabe4c5bc5378bae Mon Sep 17 00:00:00 2001 From: kalombo Date: Tue, 3 Oct 2017 17:16:25 +0500 Subject: [PATCH 04/37] remove find_project_by_id method, some pep-8 fixes --- SpiderKeeper/app/proxy/contrib/scrapy.py | 29 +++++++++++++++++------- SpiderKeeper/app/proxy/spiderctrl.py | 19 +++++++++------- SpiderKeeper/app/schedulers/common.py | 14 ++++++++---- SpiderKeeper/app/spider/controller.py | 12 +++------- SpiderKeeper/app/spider/model.py | 9 +++----- SpiderKeeper/app/util/__init__.py | 5 ++-- SpiderKeeper/app/util/http.py | 6 +++-- 7 files changed, 54 insertions(+), 40 deletions(-) diff --git a/SpiderKeeper/app/proxy/contrib/scrapy.py b/SpiderKeeper/app/proxy/contrib/scrapy.py index 9acad39e..8a2ff769 100644 --- a/SpiderKeeper/app/proxy/contrib/scrapy.py +++ b/SpiderKeeper/app/proxy/contrib/scrapy.py @@ -1,4 +1,5 @@ -import datetime, time +import datetime +import time import requests @@ -31,7 +32,9 @@ def get_project_list(self): def delete_project(self, project_name): post_data = dict(project=project_name) - data = request("post", self._scrapyd_url() + "/delproject.json", data=post_data, return_type="json") + data = request( + "post", self._scrapyd_url() + "/delproject.json", data=post_data, return_type="json" + ) return True if data and data['status'] == 'ok' else False def get_spider_list(self, project_name): @@ -57,22 +60,32 @@ def get_job_list(self, project_name, spider_status=None): for item in data[self.spider_status_name_dict[_status]]: start_time, end_time = None, None if item.get('start_time'): - start_time = datetime.datetime.strptime(item['start_time'], '%Y-%m-%d %H:%M:%S.%f') + start_time = datetime.datetime.strptime( + item['start_time'], '%Y-%m-%d %H:%M:%S.%f' + ) if item.get('end_time'): - end_time = datetime.datetime.strptime(item['end_time'], '%Y-%m-%d %H:%M:%S.%f') - result[_status].append(dict(id=item['id'], start_time=start_time, end_time=end_time)) + end_time = datetime.datetime.strptime( + item['end_time'], '%Y-%m-%d %H:%M:%S.%f' + ) + result[_status].append( + dict(id=item['id'], start_time=start_time, end_time=end_time) + ) return result if not spider_status else result[spider_status] def start_spider(self, project_name, spider_name, arguments): post_data = dict(project=project_name, spider=spider_name) post_data.update(arguments) - data = request("post", self._scrapyd_url() + "/schedule.json", data=post_data, return_type="json") + data = request( + "post", self._scrapyd_url() + "/schedule.json", data=post_data, return_type="json" + ) return data['jobid'] if data and data['status'] == 'ok' else None def cancel_spider(self, project_name, job_id): post_data = dict(project=project_name, job=job_id) - data = request("post", self._scrapyd_url() + "/cancel.json", data=post_data, return_type="json") - return data != None + data = request( + "post", self._scrapyd_url() + "/cancel.json", data=post_data, return_type="json" + ) + return data is not None def deploy(self, project_name, file_path): with open(file_path, 'rb') as f: diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index de01ea65..0c81741c 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -1,9 +1,9 @@ import datetime import random -from functools import reduce from SpiderKeeper.app import db -from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, JobPriority +from SpiderKeeper.app.spider.model import SpiderStatus, JobExecution, JobInstance, Project, \ + JobPriority class SpiderServiceProxy(object): @@ -87,7 +87,8 @@ def delete_project(self, project): spider_service_instance.delete_project(project.project_name) def get_spider_list(self, project): - spider_instance_list = self.spider_service_instances[0].get_spider_list(project.project_name) + spider_instance_list = self.spider_service_instances[0]\ + .get_spider_list(project.project_name) for spider_instance in spider_instance_list: spider_instance.project_id = project.id return spider_instance_list @@ -119,7 +120,7 @@ def sync_job_status(self, project): db.session.commit() def start_spider(self, job_instance): - project = Project.find_project_by_id(job_instance.project_id) + project = Project.query.get(job_instance.project_id) spider_name = job_instance.spider_name arguments = {} if job_instance.spider_arguments: @@ -154,7 +155,7 @@ def start_spider(self, job_instance): def cancel_spider(self, job_execution): job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) - project = Project.find_project_by_id(job_instance.project_id) + project = Project.query.get(job_instance.project_id) for spider_service_instance in self.spider_service_instances: if spider_service_instance.server == job_execution.running_on: if spider_service_instance.cancel_spider(project.project_name, job_execution.service_job_execution_id): @@ -171,11 +172,13 @@ def deploy(self, project, file_path): def log_url(self, job_execution): job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) - project = Project.find_project_by_id(job_instance.project_id) + project = Project.query.get(job_instance.project_id) for spider_service_instance in self.spider_service_instances: if spider_service_instance.server == job_execution.running_on: - return spider_service_instance.log_url(project.project_name, job_instance.spider_name, - job_execution.service_job_execution_id) + return spider_service_instance.log_url( + project.project_name, job_instance.spider_name, + job_execution.service_job_execution_id + ) @property def servers(self): diff --git a/SpiderKeeper/app/schedulers/common.py b/SpiderKeeper/app/schedulers/common.py index 595518a0..10d10f7e 100644 --- a/SpiderKeeper/app/schedulers/common.py +++ b/SpiderKeeper/app/schedulers/common.py @@ -1,7 +1,6 @@ -import threading import time -from SpiderKeeper.app import scheduler, app, agent, db +from SpiderKeeper.app import scheduler, app, agent from SpiderKeeper.app.spider.model import Project, JobInstance, SpiderInstance @@ -51,7 +50,9 @@ def reload_runnable_spider_job_execution(): available_job_ids = set() # add new job to schedule for job_instance in JobInstance.query.filter_by(enabled=0, run_type="periodic").all(): - job_id = "spider_job_%s:%s" % (job_instance.id, int(time.mktime(job_instance.date_modified.timetuple()))) + job_id = "spider_job_%s:%s" % ( + job_instance.id, int(time.mktime(job_instance.date_modified.timetuple())) + ) available_job_ids.add(job_id) if job_id not in running_job_ids: scheduler.add_job(run_spider_job, @@ -67,8 +68,11 @@ def reload_runnable_spider_job_execution(): max_instances=999, misfire_grace_time=60 * 60, coalesce=True) - app.logger.info('[load_spider_job][project:%s][spider_name:%s][job_instance_id:%s][job_id:%s]' % ( - job_instance.project_id, job_instance.spider_name, job_instance.id, job_id)) + app.logger.info( + '[load_spider_job][project:%s][spider_name:%s][job_instance_id:%s][job_id:%s]' % ( + job_instance.project_id, job_instance.spider_name, job_instance.id, job_id + ) + ) # remove invalid jobs for invalid_job_id in filter(lambda job_id: job_id.startswith("spider_job_"), running_job_ids.difference(available_job_ids)): diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index 3bfd7af9..78ba19de 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -59,7 +59,6 @@ class SpiderCtrl(flask_restful.Resource): "dataType": 'int' }]) def get(self, project_id): - project = Project.find_project_by_id(project_id) return [spider_instance.to_dict() for spider_instance in SpiderInstance.query.filter_by(project_id=project_id).all()] @@ -452,7 +451,7 @@ def inject_project(): project = Project.query.first() session['project_id'] = project.id if session.get('project_id'): - project_context['project'] = Project.find_project_by_id(session['project_id']) + project_context['project'] = Project.query.get(session['project_id']) project_context['spider_list'] = [spider_instance.to_dict() for spider_instance in SpiderInstance.query.filter_by(project_id=session['project_id']).all()] else: @@ -517,7 +516,7 @@ def project_create(): @app.route("/project//delete") def project_delete(project_id): - project = Project.find_project_by_id(project_id) + project = Project.query.get(project_id) agent.delete_project(project) db.session.delete(project) db.session.commit() @@ -536,7 +535,6 @@ def job_dashboard(project_id): @app.route("/project//job/periodic") def job_periodic(project_id): - project = Project.find_project_by_id(project_id) job_instance_list = [job_instance.to_dict() for job_instance in JobInstance.query.filter_by(run_type="periodic", project_id=project_id).all()] return render_template("job_periodic.html", @@ -545,7 +543,6 @@ def job_periodic(project_id): @app.route("/project//job/add", methods=['post']) def job_add(project_id): - project = Project.find_project_by_id(project_id) job_instance = JobInstance() job_instance.spider_name = request.form['spider_name'] job_instance.project_id = project_id @@ -627,13 +624,12 @@ def spider_dashboard(project_id): @app.route("/project//spider/deploy") def spider_deploy(project_id): - project = Project.find_project_by_id(project_id) return render_template("spider_deploy.html") @app.route("/project//spider/upload", methods=['post']) def spider_egg_upload(project_id): - project = Project.find_project_by_id(project_id) + project = Project.query.get(project_id) if 'file' not in request.files: flash('No file part') return redirect(request.referrer) @@ -654,13 +650,11 @@ def spider_egg_upload(project_id): @app.route("/project//project/stats") def project_stats(project_id): - project = Project.find_project_by_id(project_id) run_stats = JobExecution.list_run_stats_by_hours(project_id) return render_template("project_stats.html", run_stats=run_stats) @app.route("/project//server/stats") def service_stats(project_id): - project = Project.find_project_by_id(project_id) run_stats = JobExecution.list_run_stats_by_hours(project_id) return render_template("server_stats.html", run_stats=run_stats) diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index c3f86c5b..5e01c3b7 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -17,10 +17,6 @@ def load_project(cls, project_list): db.session.add(project) db.session.commit() - @classmethod - def find_project_by_id(cls, project_id): - return Project.query.filter_by(id=project_id).first() - def to_dict(self): return { "project_id": self.id, @@ -37,8 +33,9 @@ class SpiderInstance(Base): @classmethod def update_spider_instances(cls, project_id, spider_instance_list): for spider_instance in spider_instance_list: - existed_spider_instance = cls.query.filter_by(project_id=project_id, - spider_name=spider_instance.spider_name).first() + existed_spider_instance = cls.query.filter_by( + project_id=project_id, spider_name=spider_instance.spider_name + ).first() if not existed_spider_instance: db.session.add(spider_instance) db.session.commit() diff --git a/SpiderKeeper/app/util/__init__.py b/SpiderKeeper/app/util/__init__.py index 8dd75172..f8d0d14d 100644 --- a/SpiderKeeper/app/util/__init__.py +++ b/SpiderKeeper/app/util/__init__.py @@ -1,4 +1,5 @@ def project_path(): - import inspect, os + import os + import inspect this_file = inspect.getfile(inspect.currentframe()) - return os.path.abspath(os.path.dirname(this_file)+'/../') \ No newline at end of file + return os.path.abspath(os.path.dirname(this_file)+'/../') diff --git a/SpiderKeeper/app/util/http.py b/SpiderKeeper/app/util/http.py index ecc8f441..c0e81386 100644 --- a/SpiderKeeper/app/util/http.py +++ b/SpiderKeeper/app/util/http.py @@ -47,8 +47,10 @@ def request(request_type, url, data=None, retry_times=5, return_type="text"): res = request_get(url, retry_times) if request_type == 'post': res = request_post(url, data, retry_times) - if not res: return res - if return_type == 'text': return res.text + if not res: + return res + if return_type == 'text': + return res.text if return_type == 'json': try: res = res.json() From 442f6e10c6117c3929a7e97c036c681dd440feec Mon Sep 17 00:00:00 2001 From: kalombo Date: Thu, 12 Oct 2017 16:08:06 +0500 Subject: [PATCH 05/37] fixes for postgresql --- SpiderKeeper/app/proxy/spiderctrl.py | 4 ++-- SpiderKeeper/app/schedulers/common.py | 18 +++++++++--------- SpiderKeeper/app/spider/model.py | 24 ++++++++++-------------- 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index ce7301a9..953d1526 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -164,7 +164,7 @@ def start_spider(self, job_instance): db.session.commit() def cancel_spider(self, job_execution): - job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) + job_instance = JobInstance.query.get(job_execution.job_instance_id) project = Project.query.get(job_instance.project_id) for spider_service_instance in self.spider_service_instances: if spider_service_instance.server == job_execution.running_on: @@ -181,7 +181,7 @@ def deploy(self, project, file_path): return True def log_url(self, job_execution): - job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id) + job_instance = JobInstance.query.get(job_execution.job_instance_id) project = Project.query.get(job_instance.project_id) for spider_service_instance in self.spider_service_instances: if spider_service_instance.server == job_execution.running_on: diff --git a/SpiderKeeper/app/schedulers/common.py b/SpiderKeeper/app/schedulers/common.py index 560d6718..e4f9a537 100644 --- a/SpiderKeeper/app/schedulers/common.py +++ b/SpiderKeeper/app/schedulers/common.py @@ -5,20 +5,20 @@ def sync_job_execution_status_job(): - ''' + """ sync job execution running status :return: - ''' + """ for project in Project.query.all(): agent.sync_job_status(project) app.logger.debug('[sync_job_execution_status]') def sync_spiders(): - ''' + """ sync spiders :return: - ''' + """ for project in Project.query.all(): spider_instance_list = agent.get_spider_list(project) SpiderInstance.update_spider_instances(project.id, spider_instance_list) @@ -26,13 +26,13 @@ def sync_spiders(): def run_spider_job(job_instance_id): - ''' + """ run spider by scheduler :param job_instance_id: :return: - ''' + """ try: - job_instance = JobInstance.find_job_instance_by_id(job_instance_id) + job_instance = JobInstance.query.get(job_instance_id) agent.start_spider(job_instance) app.logger.info('[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]' % ( job_instance.project_id, job_instance.spider_name, job_instance.id)) @@ -41,10 +41,10 @@ def run_spider_job(job_instance_id): def reload_runnable_spider_job_execution(): - ''' + """ add periodic job to scheduler :return: - ''' + """ running_job_ids = set([job.id for job in scheduler.get_jobs()]) app.logger.debug('[running_job_ids] %s' % ','.join(running_job_ids)) available_job_ids = set() diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 112a9bd9..8fce5f79 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -30,7 +30,7 @@ class SpiderInstance(Base): __tablename__ = 'sk_spider' spider_name = db.Column(db.String(100)) - project_id = db.Column(db.INTEGER, nullable=False, index=True) + project_id = db.Column(db.Integer, nullable=False, index=True) @classmethod def update_spider_instances(cls, project_id, spider_instance_list): @@ -90,11 +90,11 @@ def list_spiders(cls, project_id): return res -class JobPriority(): +class JobPriority: LOW, NORMAL, HIGH, HIGHEST = range(-1, 3) -class JobRunType(): +class JobRunType: ONETIME = 'onetime' PERIODIC = 'periodic' @@ -138,27 +138,23 @@ def to_dict(self): def list_job_instance_by_project_id(cls, project_id): return cls.query.filter_by(project_id=project_id).all() - @classmethod - def find_job_instance_by_id(cls, job_instance_id): - return cls.query.filter_by(id=job_instance_id).first() - -class SpiderStatus(): +class SpiderStatus: PENDING, RUNNING, FINISHED, CANCELED = range(4) class JobExecution(Base): __tablename__ = 'sk_job_execution' - project_id = db.Column(db.INTEGER, nullable=False, index=True) + project_id = db.Column(db.Integer, nullable=False, index=True) service_job_execution_id = db.Column(db.String(50), nullable=False, index=True) - job_instance_id = db.Column(db.INTEGER, + job_instance_id = db.Column(db.Integer, db.ForeignKey('sk_job_instance.id'), nullable=False, index=True) job_instance = relation(JobInstance) - create_time = db.Column(db.DATETIME) - start_time = db.Column(db.DATETIME) - end_time = db.Column(db.DATETIME) - running_status = db.Column(db.INTEGER, default=SpiderStatus.PENDING) + create_time = db.Column(db.DateTime) + start_time = db.Column(db.DateTime) + end_time = db.Column(db.DateTime) + running_status = db.Column(db.Integer, default=SpiderStatus.PENDING) running_on = db.Column(db.Text) raw_stats = db.Column(db.Text) From 5841e4c08f2778a776a1230cc74b1f7bc2063c17 Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 16 Oct 2017 15:52:40 +0500 Subject: [PATCH 06/37] fork spiderkeeper --- .bumpversion.cfg | 4 ++++ README.md | 3 ++- SpiderKeeper/__init__.py | 4 ++-- SpiderKeeper/app/__init__.py | 1 + SpiderKeeper/app/templates/base.html | 2 +- SpiderKeeper/run.py | 3 +-- requirements_dev.txt | 1 + setup.py | 4 ++-- 8 files changed, 14 insertions(+), 8 deletions(-) create mode 100644 .bumpversion.cfg create mode 100644 requirements_dev.txt diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 00000000..0231ef25 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,4 @@ +[bumpversion] +current_version = 0.0.1 +files = SpiderKeeper/__init__.py +commit = True diff --git a/README.md b/README.md index 52c81f11..3129e6de 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# SpiderKeeper +# SpiderKeeper-2 +## This is a fork of [SpiderKeeper](https://github.com/DormyMo/SpiderKeeper) [![Latest Version](http://img.shields.io/pypi/v/SpiderKeeper.svg)](https://pypi.python.org/pypi/SpiderKeeper) [![Python Versions](http://img.shields.io/pypi/pyversions/SpiderKeeper.svg)](https://pypi.python.org/pypi/SpiderKeeper) diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index 164a18e9..38445292 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '1.2.0' -__author__ = 'Dormy Mo' +__version__ = '0.0.1' +__author__ = 'kalombo' diff --git a/SpiderKeeper/app/__init__.py b/SpiderKeeper/app/__init__.py index e1427cf8..b78a267a 100644 --- a/SpiderKeeper/app/__init__.py +++ b/SpiderKeeper/app/__init__.py @@ -18,6 +18,7 @@ app = Flask(__name__) # Configurations app.config.from_object(config) +app.jinja_env.globals['sk_version'] = SpiderKeeper.__version__ # Logging log = logging.getLogger('werkzeug') diff --git a/SpiderKeeper/app/templates/base.html b/SpiderKeeper/app/templates/base.html index 92b1c1f6..5f242b17 100644 --- a/SpiderKeeper/app/templates/base.html +++ b/SpiderKeeper/app/templates/base.html @@ -128,7 +128,7 @@ diff --git a/SpiderKeeper/run.py b/SpiderKeeper/run.py index c50887f9..2f989513 100644 --- a/SpiderKeeper/run.py +++ b/SpiderKeeper/run.py @@ -1,5 +1,4 @@ import logging -import os from optparse import OptionParser from SpiderKeeper.app import app, initialize @@ -20,7 +19,7 @@ def main(): initialize() app.logger.info("SpiderKeeper startd on %s:%s username:%s/password:%s with %s servers:%s" % ( opts.host, opts.port, opts.username, opts.password, opts.server_type, ','.join(app.config.get('SERVERS', [])))) - app.run(host=opts.host, port=opts.port, use_reloader=False, threaded=True) + app.run(host=opts.host, port=opts.port, use_reloader=True, threaded=True) def parse_opts(config): diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 00000000..6effe9ff --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1 @@ +bumpversion diff --git a/setup.py b/setup.py index c822317d..900c3832 100644 --- a/setup.py +++ b/setup.py @@ -5,13 +5,13 @@ from SpiderKeeper import __version__, __author__ setup( - name='SpiderKeeper', + name='SpiderKeeper-2', version=__version__, description='Admin ui for spider service', long_description= 'Go to https://github.com/DormyMo/SpiderKeeper/ for more information.', author=__author__, - author_email='modongming91@gmail.com', + author_email='nogamemorebrain@gmail.com', url='https://github.com/DormyMo/SpiderKeeper/', license='MIT', include_package_data=True, From 3eb5050c01955bf4c59a0a803984f74e1fa7f098 Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 16 Oct 2017 15:53:01 +0500 Subject: [PATCH 07/37] =?UTF-8?q?Bump=20version:=200.0.1=20=E2=86=92=200.0?= =?UTF-8?q?.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 3 ++- SpiderKeeper/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 0231ef25..e650aa88 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,4 +1,5 @@ [bumpversion] -current_version = 0.0.1 +current_version = 0.0.2 files = SpiderKeeper/__init__.py commit = True + diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index 38445292..84b7fd00 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.0.1' +__version__ = '0.0.2' __author__ = 'kalombo' From 5e4e7e7a1086734a60f1eeb31de1866bab1970a5 Mon Sep 17 00:00:00 2001 From: kalombo Date: Wed, 18 Oct 2017 15:49:50 +0500 Subject: [PATCH 08/37] fixes sql for postgresql, remove project creating, added project synchronization, other fixes --- SpiderKeeper/app/__init__.py | 7 +- SpiderKeeper/app/proxy/spiderctrl.py | 4 +- SpiderKeeper/app/schedulers/common.py | 9 ++ SpiderKeeper/app/spider/controller.py | 135 ++++++++++++------ SpiderKeeper/app/spider/model.py | 35 +---- SpiderKeeper/app/templates/base.html | 36 +---- SpiderKeeper/app/templates/index.html | 12 ++ .../app/templates/project_manage.html | 6 +- .../app/templates/spider_dashboard.html | 10 +- 9 files changed, 130 insertions(+), 124 deletions(-) create mode 100644 SpiderKeeper/app/templates/index.html diff --git a/SpiderKeeper/app/__init__.py b/SpiderKeeper/app/__init__.py index b78a267a..13fed546 100644 --- a/SpiderKeeper/app/__init__.py +++ b/SpiderKeeper/app/__init__.py @@ -106,8 +106,9 @@ def regist_server(): # start sync job status scheduler from SpiderKeeper.app.schedulers.common import sync_job_execution_status_job, sync_spiders, \ - reload_runnable_spider_job_execution + reload_runnable_spider_job_execution, sync_projects +scheduler.add_job(sync_projects, 'interval', seconds=10, id='sys_sync_projects') scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status') scheduler.add_job(sync_spiders, 'interval', seconds=10, id='sys_sync_spiders') scheduler.add_job(reload_runnable_spider_job_execution, 'interval', seconds=30, id='sys_reload_job') @@ -119,11 +120,13 @@ def start_scheduler(): def init_basic_auth(): if not app.config.get('NO_AUTH'): - basic_auth = BasicAuth(app) + BasicAuth(app) def initialize(): init_database() regist_server() + sync_projects() + sync_spiders() start_scheduler() init_basic_auth() diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index 953d1526..61ad920c 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -71,7 +71,7 @@ def server(self): return self._server -class SpiderAgent(): +class SpiderAgent(object): def __init__(self): self.spider_service_instances = [] @@ -82,7 +82,7 @@ def regist(self, spider_service_proxy): def get_project_list(self): project_list = self.spider_service_instances[0].get_project_list() Project.load_project(project_list) - return [project.to_dict() for project in Project.query.all()] + return project_list def delete_project(self, project): for spider_service_instance in self.spider_service_instances: diff --git a/SpiderKeeper/app/schedulers/common.py b/SpiderKeeper/app/schedulers/common.py index e4f9a537..fb6b6bf8 100644 --- a/SpiderKeeper/app/schedulers/common.py +++ b/SpiderKeeper/app/schedulers/common.py @@ -4,6 +4,15 @@ from SpiderKeeper.app.spider.model import Project, JobInstance, SpiderInstance +def sync_projects(): + """ + sync projects + :return: + """ + agent.get_project_list() + app.logger.debug('[sync_projects]') + + def sync_job_execution_status_job(): """ sync job execution running status diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index 712f8d88..b819b48e 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -5,16 +5,18 @@ import flask_restful import requests from flask import Blueprint, request -from flask import abort +from flask import abort, url_for from flask import flash from flask import redirect from flask import render_template from flask import session from flask_restful_swagger import swagger +from sqlalchemy import func from werkzeug.utils import secure_filename from SpiderKeeper.app import db, api, agent, app -from SpiderKeeper.app.spider.model import JobInstance, Project, JobExecution, SpiderInstance, JobRunType +from SpiderKeeper.app.spider.model import JobInstance, Project, JobExecution, SpiderInstance, \ + JobRunType api_spider_bp = Blueprint('spider', __name__) @@ -430,13 +432,6 @@ def put(self, project_id, job_exec_id): ''' -@app.before_request -def intercept_no_project(): - if request.path.find('/project//') > -1: - flash("create project first") - return redirect("/project/manage", code=302) - - @app.context_processor def inject_common(): return dict(now=datetime.datetime.now(), @@ -446,14 +441,22 @@ def inject_common(): @app.context_processor def inject_project(): project_context = {} - project_context['project_list'] = Project.query.all() - if project_context['project_list'] and (not session.get('project_id')): - project = Project.query.first() + project = None + projects = Project.query.all() + project_context['project_list'] = projects + if projects: + project = projects[0] + + project_id = session.get('project_id') + if isinstance(project_id, int): + project = Project.query.get(project_id) or project + + if project: session['project_id'] = project.id - if session.get('project_id'): - project_context['project'] = Project.query.get(session['project_id']) - project_context['spider_list'] = [spider_instance.to_dict() for spider_instance in - SpiderInstance.query.filter_by(project_id=session['project_id']).all()] + project_context['project'] = project + project_context['spider_list'] = [ + spider_instance.to_dict() for spider_instance in + SpiderInstance.query.filter_by(project_id=project.id).all()] else: project_context['project'] = {} return project_context @@ -462,13 +465,13 @@ def inject_project(): @app.context_processor def utility_processor(): def timedelta(end_time, start_time): - ''' + """ :param end_time: :param start_time: :param unit: s m h :return: - ''' + """ if not end_time or not start_time: return '' if type(end_time) == str: @@ -495,11 +498,12 @@ def index(): project = Project.query.first() if project: return redirect("/project/%s/job/dashboard" % project.id, code=302) - return redirect("/project/manage", code=302) + return render_template("index.html") -@app.route("/project/") +@app.route("/project/") def project_index(project_id): + Project.query.get_or_404(project_id) session['project_id'] = project_id return redirect("/project/%s/job/dashboard" % project_id, code=302) @@ -514,9 +518,9 @@ def project_create(): return redirect("/project/%s/spider/deploy" % project.id, code=302) -@app.route("/project//delete") +@app.route("/project//delete") def project_delete(project_id): - project = Project.query.get(project_id) + project = Project.query.get_or_404(project_id) agent.delete_project(project) db.session.delete(project) db.session.commit() @@ -528,21 +532,26 @@ def project_manage(): return render_template("project_manage.html") -@app.route("/project//job/dashboard") +@app.route("/project//job/dashboard") def job_dashboard(project_id): + Project.query.get_or_404(project_id) + session['project_id'] = project_id return render_template("job_dashboard.html", job_status=JobExecution.list_jobs(project_id)) -@app.route("/project//job/periodic") +@app.route("/project//job/periodic") def job_periodic(project_id): + Project.query.get_or_404(project_id) + session['project_id'] = project_id job_instance_list = [job_instance.to_dict() for job_instance in JobInstance.query.filter_by(run_type="periodic", project_id=project_id).all()] return render_template("job_periodic.html", job_instance_list=job_instance_list) -@app.route("/project//job/add", methods=['post']) +@app.route("/project//job/add", methods=['post']) def job_add(project_id): + Project.query.get_or_404(project_id) job_instance = JobInstance() job_instance.spider_name = request.form['spider_name'] job_instance.project_id = project_id @@ -576,38 +585,38 @@ def job_add(project_id): return redirect(request.referrer, code=302) -@app.route("/project//jobexecs//stop") +@app.route("/project//jobexecs//stop") def job_stop(project_id, job_exec_id): - job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first() + job_execution = JobExecution.query.get_or_404(job_exec_id) agent.cancel_spider(job_execution) return redirect(request.referrer, code=302) -@app.route("/project//jobexecs//log") +@app.route("/project//jobexecs//log") def job_log(project_id, job_exec_id): - job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first() + job_execution = JobExecution.query.get_or_404(job_exec_id) res = requests.get(agent.log_url(job_execution)) res.encoding = 'utf8' raw = res.text return render_template("job_log.html", log_lines=raw.split('\n')) -@app.route("/project//job//run") +@app.route("/project//job//run") def job_run(project_id, job_instance_id): - job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first() + job_instance = JobInstance.query.get_or_404(job_instance_id) agent.start_spider(job_instance) return redirect(request.referrer, code=302) -@app.route("/job//remove") +@app.route("/job//remove") def job_remove(job_instance_id): - job_instance = JobInstance.query.get(job_instance_id) + job_instance = JobInstance.query.get_or_404(job_instance_id) db.session.delete(job_instance) db.session.commit() return redirect(request.referrer, code=302) -@app.route("/project//jobs/remove") +@app.route("/project//jobs/remove") def jobs_remove(project_id): for job_instance in JobInstance.query.filter_by(project_id=project_id): db.session.delete(job_instance) @@ -615,27 +624,57 @@ def jobs_remove(project_id): return redirect(request.referrer, code=302) -@app.route("/project//job//switch") +@app.route("/project//job//switch") def job_switch(project_id, job_instance_id): - job_instance = JobInstance.query.filter_by(project_id=project_id, id=job_instance_id).first() + job_instance = JobInstance.query.get_or_404(job_instance_id) job_instance.enabled = -1 if job_instance.enabled == 0 else 0 db.session.commit() return redirect(request.referrer, code=302) -@app.route("/project//spider/dashboard") +@app.route("/project//spider/dashboard") def spider_dashboard(project_id): - spider_instance_list = SpiderInstance.list_spiders(project_id) - return render_template("spider_dashboard.html", - spider_instance_list=spider_instance_list) - - -@app.route("/project//spider/deploy") + Project.query.get_or_404(project_id) + session['project_id'] = project_id + last_runtime_query = db.session.query( + SpiderInstance.spider_name, + func.Max(JobExecution.date_created).label('last_runtime'), + ).outerjoin(JobInstance, JobInstance.spider_name == SpiderInstance.spider_name)\ + .outerjoin(JobExecution).filter(SpiderInstance.project_id == project_id)\ + .group_by(SpiderInstance.id) + + last_runtime = dict( + (spider_name, last_runtime) for spider_name, last_runtime in last_runtime_query + ) + + avg_runtime_query = db.session.query( + SpiderInstance.spider_name, + func.Avg(JobExecution.end_time - JobExecution.start_time).label('avg_runtime'), + ).outerjoin(JobInstance, JobInstance.spider_name == SpiderInstance.spider_name)\ + .outerjoin(JobExecution).filter(SpiderInstance.project_id == project_id)\ + .filter(JobExecution.end_time != None)\ + .group_by(SpiderInstance.id) + + avg_runtime = dict( + (spider_name, avg_runtime) for spider_name, avg_runtime in avg_runtime_query + ) + + spiders = [] + for spider in SpiderInstance.query.filter(SpiderInstance.project_id == project_id).all(): + spider.last_runtime = last_runtime.get(spider.spider_name) + spider.avg_runtime = avg_runtime.get(spider.spider_name) + spiders.append(spider) + return render_template("spider_dashboard.html", spiders=spiders) + + +@app.route("/project//spider/deploy") def spider_deploy(project_id): + Project.query.get_or_404(project_id) + session['project_id'] = project_id return render_template("spider_deploy.html") -@app.route("/project//spider/upload", methods=['post']) +@app.route("/project//spider/upload", methods=['post']) def spider_egg_upload(project_id): project = Project.query.get(project_id) if 'file' not in request.files: @@ -656,13 +695,17 @@ def spider_egg_upload(project_id): return redirect(request.referrer) -@app.route("/project//project/stats") +@app.route("/project//project/stats") def project_stats(project_id): + Project.query.get_or_404(project_id) + session['project_id'] = project_id run_stats = JobExecution.list_run_stats_by_hours(project_id) return render_template("project_stats.html", run_stats=run_stats) -@app.route("/project//server/stats") +@app.route("/project//server/stats") def service_stats(project_id): + Project.query.get_or_404(project_id) + session['project_id'] = project_id run_stats = JobExecution.list_run_stats_by_hours(project_id) return render_template("server_stats.html", run_stats=run_stats) diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 8fce5f79..4003e46e 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -60,35 +60,6 @@ def to_dict(self): spider_name=self.spider_name, project_id=self.project_id) - @classmethod - def list_spiders(cls, project_id): - sql_last_runtime = ''' - select * from (select a.spider_name,b.date_created from sk_job_instance as a - left join sk_job_execution as b - on a.id = b.job_instance_id - order by b.date_created desc) as c - group by c.spider_name - ''' - sql_avg_runtime = ''' - select a.spider_name,avg(end_time-start_time) from sk_job_instance as a - left join sk_job_execution as b - on a.id = b.job_instance_id - where b.end_time is not null - group by a.spider_name - ''' - last_runtime_list = dict( - (spider_name, last_run_time) for spider_name, last_run_time in db.engine.execute(sql_last_runtime)) - avg_runtime_list = dict( - (spider_name, avg_run_time) for spider_name, avg_run_time in db.engine.execute(sql_avg_runtime)) - res = [] - for spider in cls.query.filter_by(project_id=project_id).all(): - last_runtime = last_runtime_list.get(spider.spider_name) - res.append(dict(spider.to_dict(), - **{'spider_last_runtime': last_runtime if last_runtime else '-', - 'spider_avg_runtime': avg_runtime_list.get(spider.spider_name) - })) - return res - class JobPriority: LOW, NORMAL, HIGH, HIGHEST = range(-1, 3) @@ -103,17 +74,17 @@ class JobInstance(Base): __tablename__ = 'sk_job_instance' spider_name = db.Column(db.String(100), nullable=False, index=True) - project_id = db.Column(db.INTEGER, nullable=False, index=True) + project_id = db.Column(db.Integer, nullable=False, index=True) tags = db.Column(db.Text) # job tag(split by , ) spider_arguments = db.Column(db.Text) # job execute arguments(split by , ex.: arg1=foo,arg2=bar) - priority = db.Column(db.INTEGER) + priority = db.Column(db.Integer) desc = db.Column(db.Text) cron_minutes = db.Column(db.String(20), default="0") cron_hour = db.Column(db.String(20), default="*") cron_day_of_month = db.Column(db.String(20), default="*") cron_day_of_week = db.Column(db.String(20), default="*") cron_month = db.Column(db.String(20), default="*") - enabled = db.Column(db.INTEGER, default=0) # 0/-1 + enabled = db.Column(db.Integer, default=0) # 0/-1 run_type = db.Column(db.String(20)) # periodic/onetime def to_dict(self): diff --git a/SpiderKeeper/app/templates/base.html b/SpiderKeeper/app/templates/base.html index 5f242b17..3b719390 100644 --- a/SpiderKeeper/app/templates/base.html +++ b/SpiderKeeper/app/templates/base.html @@ -58,11 +58,8 @@ @@ -73,6 +70,7 @@ + {% block main_sidebar %} + {% endblock %} @@ -130,35 +129,8 @@ - SpiderKeeper. + SpiderKeeper. - -
diff --git a/SpiderKeeper/app/templates/index.html b/SpiderKeeper/app/templates/index.html new file mode 100644 index 00000000..ea49515a --- /dev/null +++ b/SpiderKeeper/app/templates/index.html @@ -0,0 +1,12 @@ +{% extends "base.html" %} +{% block main_sidebar %} +{% endblock %} +{% block content_header %} +
+
    +
  • No one project on scrapyd found. Please upload it and refresh the page after a while

  • +
+
+{% endblock %} +{% block content_body %} +{% endblock %} \ No newline at end of file diff --git a/SpiderKeeper/app/templates/project_manage.html b/SpiderKeeper/app/templates/project_manage.html index a21dfc01..4630c392 100644 --- a/SpiderKeeper/app/templates/project_manage.html +++ b/SpiderKeeper/app/templates/project_manage.html @@ -24,11 +24,7 @@

{{ project.project_name }}

diff --git a/SpiderKeeper/app/templates/spider_dashboard.html b/SpiderKeeper/app/templates/spider_dashboard.html index 5092edd2..a0068d46 100644 --- a/SpiderKeeper/app/templates/spider_dashboard.html +++ b/SpiderKeeper/app/templates/spider_dashboard.html @@ -15,12 +15,12 @@

Periodic jobs (Spiders)

Last Runtime Avg Runtime - {% for spider_instance in spider_instance_list %} + {% for spider in spiders %} - {{ spider_instance.spider_instance_id }} - {{ spider_instance.spider_name }} - {{ spider_instance.spider_last_runtime }} - {{ readable_time(spider_instance.spider_avg_runtime) }} + {{ spider.id }} + {{ spider.spider_name }} + {{ spider.last_runtime if spider.last_runtime else '-' }} + {{ readable_time(spider.avg_runtime) }} {% endfor %} From 58c3214452d2623741c7d20dc70ab9fad8ad7dc0 Mon Sep 17 00:00:00 2001 From: kalombo Date: Wed, 18 Oct 2017 17:21:51 +0500 Subject: [PATCH 09/37] fix avg runtime, added cascade deletion --- SpiderKeeper/app/spider/controller.py | 2 ++ SpiderKeeper/app/spider/model.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index b819b48e..697c3ce0 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -663,6 +663,8 @@ def spider_dashboard(project_id): for spider in SpiderInstance.query.filter(SpiderInstance.project_id == project_id).all(): spider.last_runtime = last_runtime.get(spider.spider_name) spider.avg_runtime = avg_runtime.get(spider.spider_name) + if spider.avg_runtime is not None: + spider.avg_runtime = spider.avg_runtime.total_seconds() spiders.append(spider) return render_template("spider_dashboard.html", spiders=spiders) diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 4003e46e..960b66d1 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -119,8 +119,10 @@ class JobExecution(Base): project_id = db.Column(db.Integer, nullable=False, index=True) service_job_execution_id = db.Column(db.String(50), nullable=False, index=True) - job_instance_id = db.Column(db.Integer, - db.ForeignKey('sk_job_instance.id'), nullable=False, index=True) + job_instance_id = db.Column( + db.Integer, db.ForeignKey('sk_job_instance.id', ondelete='CASCADE'), nullable=False, + index=True + ) job_instance = relation(JobInstance) create_time = db.Column(db.DateTime) start_time = db.Column(db.DateTime) From c10d7298615be216aba7f43e954cceb71a51a02c Mon Sep 17 00:00:00 2001 From: kalombo Date: Wed, 18 Oct 2017 17:22:28 +0500 Subject: [PATCH 10/37] =?UTF-8?q?Bump=20version:=200.0.2=20=E2=86=92=200.1?= =?UTF-8?q?.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e650aa88..9c22c4e8 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.2 +current_version = 0.1.0 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index 84b7fd00..dd9017f7 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.0.2' +__version__ = '0.1.0' __author__ = 'kalombo' From 79cdc80706f52c683be1d5242fe2db8c1c0252f5 Mon Sep 17 00:00:00 2001 From: kalombo Date: Fri, 20 Oct 2017 16:25:45 +0500 Subject: [PATCH 11/37] remove logger, added foreign keys --- SpiderKeeper/app/__init__.py | 19 ++----------------- SpiderKeeper/app/spider/model.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/SpiderKeeper/app/__init__.py b/SpiderKeeper/app/__init__.py index 13fed546..d7bd4340 100644 --- a/SpiderKeeper/app/__init__.py +++ b/SpiderKeeper/app/__init__.py @@ -1,5 +1,4 @@ # Import flask and template operators -import logging import traceback from apscheduler.schedulers.background import BackgroundScheduler @@ -20,15 +19,6 @@ app.config.from_object(config) app.jinja_env.globals['sk_version'] = SpiderKeeper.__version__ -# Logging -log = logging.getLogger('werkzeug') -log.setLevel(logging.ERROR) -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') -handler = logging.StreamHandler() -handler.setFormatter(formatter) -app.logger.setLevel(app.config.get('LOG_LEVEL', "INFO")) -app.logger.addHandler(handler) - # swagger api = swagger.docs(Api(app), apiVersion=SpiderKeeper.__version__, api_spec_url="/api", description='SpiderKeeper') @@ -57,12 +47,6 @@ class Base(db.Model): onupdate=db.func.current_timestamp()) -# Sample HTTP error handling -# @app.errorhandler(404) -# def not_found(error): -# abort(404) - - @app.errorhandler(Exception) def handle_error(e): code = 500 @@ -111,7 +95,8 @@ def regist_server(): scheduler.add_job(sync_projects, 'interval', seconds=10, id='sys_sync_projects') scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status') scheduler.add_job(sync_spiders, 'interval', seconds=10, id='sys_sync_spiders') -scheduler.add_job(reload_runnable_spider_job_execution, 'interval', seconds=30, id='sys_reload_job') +scheduler.add_job(reload_runnable_spider_job_execution, 'interval', seconds=30, + id='sys_reload_job') def start_scheduler(): diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 960b66d1..3cb81e48 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -30,7 +30,12 @@ class SpiderInstance(Base): __tablename__ = 'sk_spider' spider_name = db.Column(db.String(100)) - project_id = db.Column(db.Integer, nullable=False, index=True) + + project_id = db.Column( + db.Integer, db.ForeignKey('sk_project.id', ondelete='CASCADE'), nullable=False, + index=True + ) + project = relation(Project) @classmethod def update_spider_instances(cls, project_id, spider_instance_list): @@ -74,7 +79,12 @@ class JobInstance(Base): __tablename__ = 'sk_job_instance' spider_name = db.Column(db.String(100), nullable=False, index=True) - project_id = db.Column(db.Integer, nullable=False, index=True) + project_id = db.Column( + db.Integer, db.ForeignKey('sk_project.id', ondelete='CASCADE'), nullable=False, + index=True + ) + project = relation(Project) + tags = db.Column(db.Text) # job tag(split by , ) spider_arguments = db.Column(db.Text) # job execute arguments(split by , ex.: arg1=foo,arg2=bar) priority = db.Column(db.Integer) @@ -117,6 +127,7 @@ class SpiderStatus: class JobExecution(Base): __tablename__ = 'sk_job_execution' + # Useless field, that should be removed project_id = db.Column(db.Integer, nullable=False, index=True) service_job_execution_id = db.Column(db.String(50), nullable=False, index=True) job_instance_id = db.Column( From 771632d7fe79f13ddc5650b88ffa64859ed28f6f Mon Sep 17 00:00:00 2001 From: kalombo Date: Fri, 20 Oct 2017 16:29:02 +0500 Subject: [PATCH 12/37] =?UTF-8?q?Bump=20version:=200.1.0=20=E2=86=92=200.1?= =?UTF-8?q?.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 9c22c4e8..b5e0ed6e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0 +current_version = 0.1.1 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index dd9017f7..b98caef3 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.1.0' +__version__ = '0.1.1' __author__ = 'kalombo' From 96359883dc189ff0f33bd47d611ee95d0ab22171 Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 23 Oct 2017 16:16:24 +0500 Subject: [PATCH 13/37] separate scheduler into own process, remove create project view, fixes --- SpiderKeeper/app/__init__.py | 18 ------------------ SpiderKeeper/app/schedulers/__init__.py | 0 SpiderKeeper/app/spider/controller.py | 18 +++++------------- SpiderKeeper/app/spider/model.py | 1 + SpiderKeeper/app/templates/base.html | 2 +- SpiderKeeper/app/templates/job_dashboard.html | 6 +++--- SpiderKeeper/scheduler/__init__.py | 2 ++ .../schedulers/common.py => scheduler/jobs.py} | 13 ++++++++++++- 8 files changed, 24 insertions(+), 36 deletions(-) delete mode 100644 SpiderKeeper/app/schedulers/__init__.py create mode 100644 SpiderKeeper/scheduler/__init__.py rename SpiderKeeper/{app/schedulers/common.py => scheduler/jobs.py} (85%) diff --git a/SpiderKeeper/app/__init__.py b/SpiderKeeper/app/__init__.py index d7bd4340..73d1b248 100644 --- a/SpiderKeeper/app/__init__.py +++ b/SpiderKeeper/app/__init__.py @@ -34,9 +34,6 @@ def teardown_request(exception): db.session.remove() db.session.remove() -# Define apscheduler -scheduler = BackgroundScheduler() - class Base(db.Model): __abstract__ = True @@ -89,18 +86,6 @@ def regist_server(): app.register_blueprint(api_spider_bp) # start sync job status scheduler -from SpiderKeeper.app.schedulers.common import sync_job_execution_status_job, sync_spiders, \ - reload_runnable_spider_job_execution, sync_projects - -scheduler.add_job(sync_projects, 'interval', seconds=10, id='sys_sync_projects') -scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status') -scheduler.add_job(sync_spiders, 'interval', seconds=10, id='sys_sync_spiders') -scheduler.add_job(reload_runnable_spider_job_execution, 'interval', seconds=30, - id='sys_reload_job') - - -def start_scheduler(): - scheduler.start() def init_basic_auth(): @@ -111,7 +96,4 @@ def init_basic_auth(): def initialize(): init_database() regist_server() - sync_projects() - sync_spiders() - start_scheduler() init_basic_auth() diff --git a/SpiderKeeper/app/schedulers/__init__.py b/SpiderKeeper/app/schedulers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index 697c3ce0..1771a431 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -508,27 +508,19 @@ def project_index(project_id): return redirect("/project/%s/job/dashboard" % project_id, code=302) -@app.route("/project/create", methods=['post']) -def project_create(): - project_name = request.form['project_name'] - project = Project() - project.project_name = project_name - db.session.add(project) - db.session.commit() - return redirect("/project/%s/spider/deploy" % project.id, code=302) - - @app.route("/project//delete") def project_delete(project_id): project = Project.query.get_or_404(project_id) agent.delete_project(project) db.session.delete(project) db.session.commit() - return redirect("/project/manage", code=302) + return redirect(url_for('index')) -@app.route("/project/manage") -def project_manage(): +@app.route("/project//manage") +def project_manage(project_id): + Project.query.get_or_404(project_id) + session['project_id'] = project_id return render_template("project_manage.html") diff --git a/SpiderKeeper/app/spider/model.py b/SpiderKeeper/app/spider/model.py index 3cb81e48..21c22f65 100644 --- a/SpiderKeeper/app/spider/model.py +++ b/SpiderKeeper/app/spider/model.py @@ -100,6 +100,7 @@ class JobInstance(Base): def to_dict(self): return dict( job_instance_id=self.id, + project_id=self.project_id, spider_name=self.spider_name, tags=self.tags.split(',') if self.tags else None, spider_arguments=self.spider_arguments, diff --git a/SpiderKeeper/app/templates/base.html b/SpiderKeeper/app/templates/base.html index 3b719390..dab12222 100644 --- a/SpiderKeeper/app/templates/base.html +++ b/SpiderKeeper/app/templates/base.html @@ -98,7 +98,7 @@
  • Running Stats
  • -
  • Manage
  • +
  • Manage
  • SERVER
  • Usage Stats
  • diff --git a/SpiderKeeper/app/templates/job_dashboard.html b/SpiderKeeper/app/templates/job_dashboard.html index 8e4c4a9f..9874f852 100644 --- a/SpiderKeeper/app/templates/job_dashboard.html +++ b/SpiderKeeper/app/templates/job_dashboard.html @@ -38,7 +38,7 @@

    Next Jobs

    {% if job.job_instance %} {{ job.job_execution_id }} - {{ job.job_instance_id }} + {{ job.job_instance_id }} {{ job.job_instance.spider_name }} {{ job.job_instance.spider_arguments }} @@ -95,7 +95,7 @@

    Running Jobs

    {% if job.job_instance %} {{ job.job_execution_id }} - {{ job.job_instance_id }} + {{ job.job_instance_id }} {{ job.job_instance.spider_name }} {{ job.job_instance.spider_arguments }} @@ -163,7 +163,7 @@

    Completed Jobs

    {% if job.job_instance %} {{ job.job_execution_id }} - {{ job.job_instance_id }} + {{ job.job_instance_id }} {{ job.job_instance.spider_name }} {{ job.job_instance.spider_arguments }} diff --git a/SpiderKeeper/scheduler/__init__.py b/SpiderKeeper/scheduler/__init__.py new file mode 100644 index 00000000..633f8661 --- /dev/null +++ b/SpiderKeeper/scheduler/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- + diff --git a/SpiderKeeper/app/schedulers/common.py b/SpiderKeeper/scheduler/jobs.py similarity index 85% rename from SpiderKeeper/app/schedulers/common.py rename to SpiderKeeper/scheduler/jobs.py index fb6b6bf8..9e63e54b 100644 --- a/SpiderKeeper/app/schedulers/common.py +++ b/SpiderKeeper/scheduler/jobs.py @@ -1,9 +1,13 @@ import time -from SpiderKeeper.app import scheduler, app, agent +from apscheduler.schedulers.background import BlockingScheduler +from SpiderKeeper.app import app, agent from SpiderKeeper.app.spider.model import Project, JobInstance, SpiderInstance +scheduler = BlockingScheduler() + + def sync_projects(): """ sync projects @@ -87,3 +91,10 @@ def reload_runnable_spider_job_execution(): running_job_ids.difference(available_job_ids)): scheduler.remove_job(invalid_job_id) app.logger.info('[drop_spider_job][job_id:%s]' % invalid_job_id) + + +scheduler.add_job(sync_projects, 'interval', seconds=10, id='sys_sync_projects') +scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status') +scheduler.add_job(sync_spiders, 'interval', seconds=10, id='sys_sync_spiders') +scheduler.add_job(reload_runnable_spider_job_execution, 'interval', seconds=30, + id='sys_reload_job') From de986d6a79d86f38fd2fc68dab777606757d9a89 Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 23 Oct 2017 16:17:20 +0500 Subject: [PATCH 14/37] =?UTF-8?q?Bump=20version:=200.1.1=20=E2=86=92=200.1?= =?UTF-8?q?.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index b5e0ed6e..f7b3578c 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.1 +current_version = 0.1.2 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index b98caef3..c9553577 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.1.1' +__version__ = '0.1.2' __author__ = 'kalombo' From f7a2ebd43621ba62a021aac91e2fa40b6b5954a7 Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 23 Oct 2017 17:14:12 +0500 Subject: [PATCH 15/37] remove log styles, refactoring --- SpiderKeeper/app/spider/controller.py | 13 +++++-------- SpiderKeeper/app/templates/job_dashboard.html | 4 ++-- SpiderKeeper/app/templates/job_log.html | 16 ---------------- SpiderKeeper/app/templates/job_periodic.html | 2 +- 4 files changed, 8 insertions(+), 27 deletions(-) delete mode 100644 SpiderKeeper/app/templates/job_log.html diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index 1771a431..4b0d16fc 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -584,17 +584,14 @@ def job_stop(project_id, job_exec_id): return redirect(request.referrer, code=302) -@app.route("/project//jobexecs//log") -def job_log(project_id, job_exec_id): +@app.route("/project/jobexecs//log") +def job_log(job_exec_id): job_execution = JobExecution.query.get_or_404(job_exec_id) - res = requests.get(agent.log_url(job_execution)) - res.encoding = 'utf8' - raw = res.text - return render_template("job_log.html", log_lines=raw.split('\n')) + return redirect(agent.log_url(job_execution)) -@app.route("/project//job//run") -def job_run(project_id, job_instance_id): +@app.route("/project/job//run") +def job_run(job_instance_id): job_instance = JobInstance.query.get_or_404(job_instance_id) agent.start_spider(job_instance) return redirect(request.referrer, code=302) diff --git a/SpiderKeeper/app/templates/job_dashboard.html b/SpiderKeeper/app/templates/job_dashboard.html index 9874f852..c2b2217f 100644 --- a/SpiderKeeper/app/templates/job_dashboard.html +++ b/SpiderKeeper/app/templates/job_dashboard.html @@ -119,7 +119,7 @@

    Running Jobs

    {% endif %} {{ timedelta(now,job.start_time) }} {{ job.start_time }} - Log {{ job.running_on }} @@ -190,7 +190,7 @@

    Completed Jobs

    {{ job.items_count }} {{ job.warnings_count }} {{ job.errors_count }} - Log {% if job.running_status == 2 %} diff --git a/SpiderKeeper/app/templates/job_log.html b/SpiderKeeper/app/templates/job_log.html deleted file mode 100644 index a130775b..00000000 --- a/SpiderKeeper/app/templates/job_log.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - -{% for line in log_lines %} -

    {{ line }}

    -{% endfor %} - - \ No newline at end of file diff --git a/SpiderKeeper/app/templates/job_periodic.html b/SpiderKeeper/app/templates/job_periodic.html index d6d1d6d0..41675f9b 100644 --- a/SpiderKeeper/app/templates/job_periodic.html +++ b/SpiderKeeper/app/templates/job_periodic.html @@ -80,7 +80,7 @@

    Periodic jobs (Spiders)

    {% endif %} - Run Remove From f9de501f8724a6caaa08882ac6b8477f77d98376 Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 23 Oct 2017 17:14:22 +0500 Subject: [PATCH 16/37] =?UTF-8?q?Bump=20version:=200.1.2=20=E2=86=92=200.1?= =?UTF-8?q?.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index f7b3578c..64ad4521 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.2 +current_version = 0.1.3 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index c9553577..0b148eb8 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.1.2' +__version__ = '0.1.3' __author__ = 'kalombo' From 942c3c2b40084e7b4888fd2e0b3b29b23f5dd1a0 Mon Sep 17 00:00:00 2001 From: kalombo Date: Tue, 24 Oct 2017 09:21:09 +0500 Subject: [PATCH 17/37] revert logs grabbing with requests --- SpiderKeeper/app/spider/controller.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/SpiderKeeper/app/spider/controller.py b/SpiderKeeper/app/spider/controller.py index 4b0d16fc..4c859704 100644 --- a/SpiderKeeper/app/spider/controller.py +++ b/SpiderKeeper/app/spider/controller.py @@ -4,7 +4,7 @@ import flask_restful import requests -from flask import Blueprint, request +from flask import Blueprint, request, Response from flask import abort, url_for from flask import flash from flask import redirect @@ -587,7 +587,9 @@ def job_stop(project_id, job_exec_id): @app.route("/project/jobexecs//log") def job_log(job_exec_id): job_execution = JobExecution.query.get_or_404(job_exec_id) - return redirect(agent.log_url(job_execution)) + res = requests.get(agent.log_url(job_execution)) + res.encoding = 'utf8' + return Response(res.text, content_type='text/plain; charset=utf-8') @app.route("/project/job//run") From 38a4fa728ee40f869117ce4f1d6bd59f6b8e62ca Mon Sep 17 00:00:00 2001 From: kalombo Date: Tue, 24 Oct 2017 09:21:19 +0500 Subject: [PATCH 18/37] =?UTF-8?q?Bump=20version:=200.1.3=20=E2=86=92=200.1?= =?UTF-8?q?.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 64ad4521..1ea03a55 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.3 +current_version = 0.1.4 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index 0b148eb8..20e79485 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.1.3' +__version__ = '0.1.4' __author__ = 'kalombo' From a7082c9926fbcb7a67df5b996b7e48921fd1c26d Mon Sep 17 00:00:00 2001 From: kalombo Date: Thu, 26 Oct 2017 13:47:30 +0500 Subject: [PATCH 19/37] update README, fix spiderkeeper script --- CHANGELOG.md | 43 +++++++--------------------------- README.md | 28 ++++++---------------- SpiderKeeper/run.py | 19 +++++++++++---- SpiderKeeper/scheduler/jobs.py | 18 ++++++-------- SpiderKeeper/uwsgi.py | 11 --------- setup.py | 4 ++-- 6 files changed, 40 insertions(+), 83 deletions(-) delete mode 100644 SpiderKeeper/uwsgi.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c88d5327..2e0d80a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,34 +1,9 @@ -# SpiderKeeper Changelog -## 1.2.0 (2017-07-24) -- support chose server manually -- support set cron exp manually -- fix log Chinese decode problem -- fix scheduler trigger not fire problem -- fix not delete project on scrapyd problem - -## 1.1.0 (2017-04-25) -- support basic auth -- show spider crawl time info (last_runtime,avg_runtime) -- optimized for mobile - -## 1.0.3 (2017-04-17) -- support view log - -## 1.0.0 (2017-03-30) -- refactor -- support py3 -- optimized api -- optimized scheduler -- more scalable (can support access multiply spider service) -- show running stats - -## 0.2.0 (2016-04-13) -- support view job of multi daemons. -- support run on multi daemons. -- support choice running daemon automaticaly. - -## 0.1.1 (2016-02-16) -- add status monitor(https://github.com/afaqurk/linux-dash) - -## 0.1.0 (2016-01-18) -- initial. \ No newline at end of file +# SpiderKeeper-2 Changelog +## 0.2.0 (2017-10-26) +- SpiderKeeper was forked to Spiderkeeper-2 +- Add button for removing all periodic jobs +- All tasks show stats now. +- When you run spiderkeeper under wsgi you should not use background scheduler, [see issue](https://github.com/agronholm/apscheduler/issues/160), you should run scheduler in separated process. So, scheduler was separated to own module +- Add foreign constraints to models. +- No need to create project now, all projects will be synchronized automatically with scrapyd. +- Fix bugs. \ No newline at end of file diff --git a/README.md b/README.md index 3129e6de..dd6fd2d8 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # SpiderKeeper-2 -## This is a fork of [SpiderKeeper](https://github.com/DormyMo/SpiderKeeper) +### This is a fork of [SpiderKeeper](https://github.com/DormyMo/SpiderKeeper). [Here](https://github.com/kalombos/SpiderKeeper/blob/master/CHANGELOG.md) is the changes -[![Latest Version](http://img.shields.io/pypi/v/SpiderKeeper.svg)](https://pypi.python.org/pypi/SpiderKeeper) -[![Python Versions](http://img.shields.io/pypi/pyversions/SpiderKeeper.svg)](https://pypi.python.org/pypi/SpiderKeeper) -[![The MIT License](http://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/DormyMo/SpiderKeeper/blob/master/LICENSE) +[![Latest Version](http://img.shields.io/pypi/v/SpiderKeeper-2.svg)](https://pypi.python.org/pypi/SpiderKeeper-2) +[![Python Versions](https://img.shields.io/pypi/pyversions/SpiderKeeper-2.svg)](https://pypi.python.org/pypi/SpiderKeeper-2) +![The MIT License](http://img.shields.io/badge/license-MIT-blue.svg) A scalable admin ui for spider service @@ -79,34 +79,20 @@ Visit: - api swagger: http://localhost:5000/api.html -``` - -## TODO -- [ ] Job dashboard support filter -- [x] User Authentication -- [ ] Collect & Show scrapy crawl stats -- [ ] Optimize load balancing - -## Versioning - -We use [SemVer](http://semver.org/) for versioning. For the versions available, see the [tags on this repository](https://github.com/DormyMo/SpiderKeeper/tags). +``` ## Authors - *Initial work* - [DormyMo](https://github.com/DormyMo) +- *Fork author* - [kalombo](https://github.com/kalombos/) -See also the list of [contributors](https://github.com/DormyMo/SpiderKeeper/contributors) who participated in this project. ## License -This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details +This project is licensed under the MIT License. ## Contributing Contributions are welcomed! -## 交流反馈 -![Contact](https://raw.githubusercontent.com/DormyMo/SpiderKeeper/master/screenshot/qqgroup_qrcode.png) -## 捐赠 -![Contact](https://raw.githubusercontent.com/DormyMo/SpiderKeeper/master/screenshot/donate_wechat.png) diff --git a/SpiderKeeper/run.py b/SpiderKeeper/run.py index 2f989513..ae20e3d7 100644 --- a/SpiderKeeper/run.py +++ b/SpiderKeeper/run.py @@ -2,6 +2,8 @@ from optparse import OptionParser from SpiderKeeper.app import app, initialize +from apscheduler.schedulers.background import BackgroundScheduler +from SpiderKeeper.scheduler.jobs import add_jobs def main(): @@ -17,9 +19,16 @@ def main(): if opts.verbose: app.logger.setLevel(logging.DEBUG) initialize() - app.logger.info("SpiderKeeper startd on %s:%s username:%s/password:%s with %s servers:%s" % ( - opts.host, opts.port, opts.username, opts.password, opts.server_type, ','.join(app.config.get('SERVERS', [])))) - app.run(host=opts.host, port=opts.port, use_reloader=True, threaded=True) + scheduler = BackgroundScheduler() + add_jobs(scheduler) + scheduler.start() + app.logger.info( + "SpiderKeeper startd on %s:%s username:%s/password:%s with %s servers:%s" % ( + opts.host, opts.port, opts.username, opts.password, opts.server_type, + ','.join(app.config.get('SERVERS', [])) + ) + ) + app.run(host=opts.host, port=opts.port, use_reloader=False, threaded=True) def parse_opts(config): @@ -52,7 +61,9 @@ def parse_opts(config): action='append', default=[]) parser.add_option("--database-url", - help='SpiderKeeper metadata database default: %s' % config.get('SQLALCHEMY_DATABASE_URI'), + help='SpiderKeeper metadata database default: %s' % config.get( + 'SQLALCHEMY_DATABASE_URI' + ), dest='database_url', default=config.get('SQLALCHEMY_DATABASE_URI')) diff --git a/SpiderKeeper/scheduler/jobs.py b/SpiderKeeper/scheduler/jobs.py index 9e63e54b..c48cce23 100644 --- a/SpiderKeeper/scheduler/jobs.py +++ b/SpiderKeeper/scheduler/jobs.py @@ -1,13 +1,8 @@ import time - -from apscheduler.schedulers.background import BlockingScheduler from SpiderKeeper.app import app, agent from SpiderKeeper.app.spider.model import Project, JobInstance, SpiderInstance -scheduler = BlockingScheduler() - - def sync_projects(): """ sync projects @@ -53,7 +48,7 @@ def run_spider_job(job_instance_id): app.logger.error('[run_spider_job] ' + str(e)) -def reload_runnable_spider_job_execution(): +def reload_runnable_spider_job_execution(scheduler): """ add periodic job to scheduler :return: @@ -93,8 +88,9 @@ def reload_runnable_spider_job_execution(): app.logger.info('[drop_spider_job][job_id:%s]' % invalid_job_id) -scheduler.add_job(sync_projects, 'interval', seconds=10, id='sys_sync_projects') -scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status') -scheduler.add_job(sync_spiders, 'interval', seconds=10, id='sys_sync_spiders') -scheduler.add_job(reload_runnable_spider_job_execution, 'interval', seconds=30, - id='sys_reload_job') +def add_jobs(scheduler): + scheduler.add_job(sync_projects, 'interval', seconds=10, id='sys_sync_projects') + scheduler.add_job(sync_job_execution_status_job, 'interval', seconds=5, id='sys_sync_status') + scheduler.add_job(sync_spiders, 'interval', seconds=10, id='sys_sync_spiders') + scheduler.add_job(reload_runnable_spider_job_execution, 'interval', args=[scheduler], + seconds=30, id='sys_reload_job') diff --git a/SpiderKeeper/uwsgi.py b/SpiderKeeper/uwsgi.py deleted file mode 100644 index df68c838..00000000 --- a/SpiderKeeper/uwsgi.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# @Time : 2017-09-24 14:53 -# @Author : modm -''' -you can start the server by uwsgi -like gunicorn -w 4 SpiderKeeper.uwsgi:app -''' -from SpiderKeeper.app import app, initialize - -initialize() diff --git a/setup.py b/setup.py index 900c3832..90a6cf98 100644 --- a/setup.py +++ b/setup.py @@ -9,10 +9,10 @@ version=__version__, description='Admin ui for spider service', long_description= - 'Go to https://github.com/DormyMo/SpiderKeeper/ for more information.', + 'Go to https://github.com/kalombos/SpiderKeeper/ for more information.', author=__author__, author_email='nogamemorebrain@gmail.com', - url='https://github.com/DormyMo/SpiderKeeper/', + url='https://github.com/kalombos/SpiderKeeper/', license='MIT', include_package_data=True, packages=find_packages(), From 163b6a9dd4cc06407abaf3ebbf448fac0ced4d95 Mon Sep 17 00:00:00 2001 From: kalombo Date: Thu, 26 Oct 2017 13:50:35 +0500 Subject: [PATCH 20/37] =?UTF-8?q?Bump=20version:=200.1.4=20=E2=86=92=200.2?= =?UTF-8?q?.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 1ea03a55..d503dabe 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.4 +current_version = 0.2.0 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index 20e79485..089d5e57 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.1.4' +__version__ = '0.2.0' __author__ = 'kalombo' From 193178b89a07d98ae05299470e05fbf96c02e804 Mon Sep 17 00:00:00 2001 From: kalombo Date: Wed, 28 Mar 2018 17:08:20 +0500 Subject: [PATCH 21/37] remove lost pending jobs --- SpiderKeeper/app/proxy/spiderctrl.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index 61ad920c..57d6c8ea 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -126,6 +126,18 @@ def sync_job_status(self, project): if match: job_execution.raw_stats = match[0] job_execution.process_raw_stats() + + # delete lost uncompleted jobs + all_job_ids = [] + for job_set in job_status.values(): + for job in job_set: + all_job_ids.append(job['id']) + if all_job_ids: + db.session.query(JobExecution).filter( + JobExecution.service_job_execution_id.notin_(all_job_ids) + ).filter(JobExecution.running_status != SpiderStatus.FINISHED)\ + .filter(JobExecution.running_status != SpiderStatus.CANCELED)\ + .delete(synchronize_session='fetch') # commit db.session.commit() From d6e14bdc0e3c6e46205242a85edbfd2d966431c6 Mon Sep 17 00:00:00 2001 From: kalombo Date: Wed, 28 Mar 2018 17:08:27 +0500 Subject: [PATCH 22/37] =?UTF-8?q?Bump=20version:=200.2.0=20=E2=86=92=200.2?= =?UTF-8?q?.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index d503dabe..74c453d4 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.0 +current_version = 0.2.1 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index 089d5e57..6a4c784e 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.2.0' +__version__ = '0.2.1' __author__ = 'kalombo' From cafe3c90545f9fa362bedffad5c9dd8504b6d118 Mon Sep 17 00:00:00 2001 From: kalombo Date: Wed, 28 Mar 2018 17:14:03 +0500 Subject: [PATCH 23/37] fix removing lost jobs --- SpiderKeeper/app/proxy/spiderctrl.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/SpiderKeeper/app/proxy/spiderctrl.py b/SpiderKeeper/app/proxy/spiderctrl.py index 57d6c8ea..3532437c 100644 --- a/SpiderKeeper/app/proxy/spiderctrl.py +++ b/SpiderKeeper/app/proxy/spiderctrl.py @@ -132,12 +132,11 @@ def sync_job_status(self, project): for job_set in job_status.values(): for job in job_set: all_job_ids.append(job['id']) - if all_job_ids: - db.session.query(JobExecution).filter( - JobExecution.service_job_execution_id.notin_(all_job_ids) - ).filter(JobExecution.running_status != SpiderStatus.FINISHED)\ - .filter(JobExecution.running_status != SpiderStatus.CANCELED)\ - .delete(synchronize_session='fetch') + db.session.query(JobExecution).filter( + JobExecution.service_job_execution_id.notin_(all_job_ids) + ).filter(JobExecution.running_status != SpiderStatus.FINISHED)\ + .filter(JobExecution.running_status != SpiderStatus.CANCELED)\ + .delete(synchronize_session='fetch') # commit db.session.commit() From 1be27f2b52e182c502a66f15d234258f8a5ecc75 Mon Sep 17 00:00:00 2001 From: kalombo Date: Wed, 28 Mar 2018 17:14:48 +0500 Subject: [PATCH 24/37] =?UTF-8?q?Bump=20version:=200.2.1=20=E2=86=92=200.2?= =?UTF-8?q?.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- SpiderKeeper/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 74c453d4..c43636a0 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.1 +current_version = 0.2.2 files = SpiderKeeper/__init__.py commit = True diff --git a/SpiderKeeper/__init__.py b/SpiderKeeper/__init__.py index 6a4c784e..4a97f1f1 100644 --- a/SpiderKeeper/__init__.py +++ b/SpiderKeeper/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.2.1' +__version__ = '0.2.2' __author__ = 'kalombo' From 0ee028160f9f959b6e5054f77e9c415bb6ba7f9f Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 9 Apr 2018 13:20:40 +0500 Subject: [PATCH 25/37] fix readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd6fd2d8..0c1e4106 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Current Support spider service ``` -pip install spiderkeeper +pip install spiderkeeper-2 ``` ### Deployment From 3374d327e75411d120cd1037ba2247fb5a4ee42a Mon Sep 17 00:00:00 2001 From: kalombo Date: Mon, 21 May 2018 12:10:24 +0500 Subject: [PATCH 26/37] move scripts to head, fix static urls --- SpiderKeeper/app/templates/base.html | 42 ++++++++++--------- SpiderKeeper/app/templates/project_stats.html | 2 +- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/SpiderKeeper/app/templates/base.html b/SpiderKeeper/app/templates/base.html index dab12222..603a79e6 100644 --- a/SpiderKeeper/app/templates/base.html +++ b/SpiderKeeper/app/templates/base.html @@ -7,25 +7,38 @@ - + - + - + - + - + - + + + + + + + + + + + + + + @@ -135,18 +148,7 @@ - - - - - - - - - - - - + {% block script %}{% endblock %} \ No newline at end of file diff --git a/SpiderKeeper/app/templates/project_stats.html b/SpiderKeeper/app/templates/project_stats.html index 6df9831a..0d596b02 100644 --- a/SpiderKeeper/app/templates/project_stats.html +++ b/SpiderKeeper/app/templates/project_stats.html @@ -18,7 +18,7 @@

    Spider Running Stats (last 24 hours)

    {% endblock %} {% block script %} - +