What is the right way to scrape asynchronously and store my results with Django, Celery and Redis?

Question

What is the right way to scrape asynchronously and store my results with Django, Celery and Redis?

I am trying to understand what my problem is when I try to scrape using a function I created in my Django application. The function goes to a website, collects data and stores it in my database. At first I tried using rq and redis for a while, but I kept getting an error message. Someone suggested I try Celery instead, so I did. But now I see that the problem is not with rq or Celery, because I get the same error message as before. I tried importing the function, but still got the error, and then I thought that maybe having the actual function in the tasks.py file would make a difference, but it does not. Here is the function I tried to use in my tasks.py:

import requests
from bs4 import BeautifulSoup
from src.blog.models import Post
import random
import re
from django.contrib.auth.models import User
import os

@app.task
def p_panties():
    """Scrape the listing page and save unseen videos as draft ``Post`` rows.

    The task downloads ``pan_url``, takes at most the first 13
    ``<div class="post-start">`` rows, follows each row's link once to find
    the embedded video URL, and creates a ``Post`` for every title that is
    not already in the database.

    Returns:
        list[dict]: one dict per scraped row with the keys ``href``, ``text``,
        ``embed``, ``comments``, ``src``, ``name``, ``url``, ``author`` and
        ``video``.

    Raises:
        IndexError: if a linked page has no ``<p>`` element, the first
            ``<p>`` is empty, or no URL is found inside it.
    """
    max_posts = 13
    title_suffixes = ('– Official video HD', '– Oficial video HD')
    embed_prefix = 'https://www.youtube.com/embed/'
    # Raw string: the pattern contains ``\(`` / ``\)``, which are invalid
    # escape sequences in a normal string literal.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    def swappo():
        """Return one browser User-Agent string chosen at random."""
        # The values carry no surrounding spaces or quote characters: those
        # would be sent literally, and newer ``requests`` releases reject
        # header values that start with whitespace.
        agent_list = [
            'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)',
            'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0',
        ]
        return random.choice(agent_list)

    headers = {
        "user-agent": swappo(),
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }

    pan_url = 'http://www.example.org'
    shtml = requests.get(pan_url, headers=headers)
    soup = BeautifulSoup(shtml.text, 'html5lib')
    video_row = soup.find_all('div', {'class': 'post-start'})
    name = 'pan videos'

    # The author id differs between the local (OSX) and the deployed database.
    if os.getenv('_system_name') == 'OSX':
        author = User.objects.get(id=2)
    else:
        author = User.objects.get(id=3)

    def youtube_link(url):
        """Fetch ``url`` and return the first URL found in its first ``<p>``."""
        youtube_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(youtube_page.text, 'html5lib')
        first_paragraph = soupdata.find_all('p')[0]
        # Only the first child node of the paragraph is searched for a URL.
        tubby = str(first_paragraph.contents[0])
        urls = url_pattern.findall(tubby)
        return urls[0].replace('?&amp;autoplay=1', '')

    def yt_id(code):
        """Return ``code`` with the YouTube embed URL prefix removed."""
        return code.replace(embed_prefix, '')

    def strip_hd(hd, move):
        """Return ``hd`` with every occurrence of ``move`` removed."""
        return hd.replace(move, '')

    entries = []
    # Limit the rows *before* scraping so that no detail page is downloaded
    # for a row that would be thrown away afterwards.
    for div in video_row[:max_posts]:
        href = div.a.get('href')
        title = div.h2.text
        for suffix in title_suffixes:
            title = strip_hd(title, suffix)
        title = title.lstrip()
        # One request per row: the embed URL feeds both 'embed' and 'src'.
        embed = youtube_link(href)
        entries.append({
            'href': href,
            'text': title,
            'embed': embed,
            'comments': title,
            'src': 'https://i.ytimg.com/vi/' + yt_id(embed) + '/maxresdefault.jpg',
            'name': name,
            'url': href,
            # NOTE(review): a User instance is not JSON-serializable; confirm
            # the Celery result serializer can handle the returned entries.
            'author': author,
            'video': True,
        })

    for entry in entries:
        # Titles act as the uniqueness key; skip anything already stored.
        if Post.objects.filter(title=entry['text']).exists():
            continue
        post = Post()
        post.title = entry['text']
        post.name = entry['name']
        post.url = entry['url']
        post.body = entry['comments']
        post.image_url = entry['src']
        post.video_path = entry['embed']
        post.author = entry['author']
        post.video = entry['video']
        post.status = 'draft'
        # The post must be saved before tags can be attached to it.
        post.save()
        post.tags.add("video", "Musica")
    return entries

and in the Python shell, if I run

from tasks import *

I get

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/ray/Desktop/myheroku/practice/tasks.py", line 5, in <module>
    from src.blog.models import Post
  File "/Users/ray/Desktop/myheroku/practice/src/blog/models.py", line 3, in <module>
    from taggit.managers import TaggableManager
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/taggit/managers.py", line 7, in <module>
    from django.contrib.contenttypes.models import ContentType
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/contrib/contenttypes/models.py", line 159, in <module>
    class ContentType(models.Model):
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/contrib/contenttypes/models.py", line 160, in ContentType
    app_label = models.CharField(max_length=100)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/db/models/fields/__init__.py", line 1072, in __init__
    super(CharField, self).__init__(*args, **kwargs)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/db/models/fields/__init__.py", line 166, in __init__
    self.db_tablespace = db_tablespace or settings.DEFAULT_INDEX_TABLESPACE
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/conf/__init__.py", line 55, in __getattr__
    self._setup(name)
  File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/conf/__init__.py", line 41, in _setup
    % (desc, ENVIRONMENT_VARIABLE))
django.core.exceptions.ImproperlyConfigured: Requested setting DEFAULT_INDEX_TABLESPACE, but settings are not configured. You must either define the environment variable DJANGO_SETTINGS_MODULE or call settings.configure() before accessing settings.

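This is the same traceback I got with rq and redis. However, if I comment out the Django imports and the model code, like this,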

import requests
from bs4 import BeautifulSoup
# from src.blog.models import Post
import random
import re
# from django.contrib.auth.models import User
import os

@app.task
def p_panties():
    """Scrape the listing page and return the scraped entries without saving.

    Downloads ``pan_url``, builds one dict per ``<div class="post-start">``
    row (following each row's link to find the embedded video URL) and
    returns the first 13 dicts. All Django model access (author lookup and
    ``Post`` creation) is commented out in this variant, so nothing is
    written to the database.
    """
    def swappo():
        """Return one of four hard-coded User-Agent strings at random."""
        user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
        user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
        user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
        user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

        agent_list = [user_one, user_two, user_thr, user_for]
        a = random.choice(agent_list)
        return a

    # Request headers shared by the listing request and every detail request.
    headers = {
        "user-agent": swappo(),
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }

    pan_url = 'http://www.example.org'
    shtml = requests.get(pan_url, headers=headers)
    soup = BeautifulSoup(shtml.text, 'html5lib')
    # Each matching div is one listing row to turn into an entry.
    video_row = soup.find_all('div', {'class': 'post-start'})
    name = 'pan videos'

    # if os.getenv('_system_name') == 'OSX':
    #     author = User.objects.get(id=2)
    # else:
    #     author = User.objects.get(id=3)

    def youtube_link(url):
        """Fetch ``url`` and return the first URL found in its first ``<p>``.

        Raises IndexError when the page has no ``<p>``, the ``<p>`` has no
        children, or no URL matches.
        """
        youtube_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(youtube_page.text, 'html5lib')
        video_row = soupdata.find_all('p')[0]
        # Iterating the tag yields its child nodes; only the first is used.
        entries = [{'text': div,
                    } for div in video_row]
        tubby = str(entries[0]['text'])
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tubby)
        # Drop the autoplay query suffix from the first match.
        cleaned_url = urls[0].replace('?&amp;autoplay=1', '')
        return cleaned_url

    def yt_id(code):
        """Return ``code`` with the YouTube embed URL prefix removed."""
        the_id = code
        youtube_id = the_id.replace('https://www.youtube.com/embed/', '')
        return youtube_id

    def strip_hd(hd, move):
        """Return ``hd`` with every occurrence of ``move`` removed."""
        # NOTE: the local name ``str`` shadows the builtin inside this helper.
        str = hd
        new_hd = str.replace(move, '')
        return new_hd

    # NOTE: youtube_link() is called twice per row ('embed' and 'src'), so
    # every detail page is requested twice, and all rows are processed before
    # the list is cut down to its first 13 items.
    entries = [{'href': div.a.get('href'),
                'text': strip_hd(strip_hd(div.h2.text, '– Official video HD'), '– Oficial video HD').lstrip(),
                'embed': youtube_link(div.a.get('href')), # embed URL taken from the linked page
                'comments': strip_hd(strip_hd(div.h2.text, '– Official video HD'), '– Oficial video HD').lstrip(),
                'src': 'https://i.ytimg.com/vi/' + yt_id(youtube_link(div.a.get('href'))) + '/maxresdefault.jpg', # thumbnail image URL
                'name': name,
                'url': div.a.get('href'),
                # 'author': author,
                'video': True

                } for div in video_row][:13]
    #
    # for entry in entries:
    #     post = Post()
    #     post.title = entry['text']
    #     title = post.title
    #     if not Post.objects.filter(title=title):
    #         post.title = entry['text']
    #         post.name = entry['name']
    #         post.url = entry['url']
    #         post.body = entry['comments']
    #         post.image_url = entry['src']
    #         post.video_path = entry['embed']
    #         post.author = entry['author']
    #         post.video = entry['video']
    #         post.status = 'draft'
    #         post.save()
    #         post.tags.add("video", "Musica")
    return entries

the task runs successfully, and the worker log shows:

[2016-08-13 08:31:17,222: INFO/MainProcess] Received task: tasks.p_panties[e196c6bf-2b87-4bb2-ae11-452e3c41434f]
[2016-08-13 08:31:17,238: INFO/Worker-4] Starting new HTTP connection (1): www.example.org
[2016-08-13 08:31:17,582: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:18,314: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:18,870: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:19,476: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:20,089: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:20,711: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:21,218: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:21,727: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:22,372: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:22,785: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:23,375: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:23,983: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:24,396: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:25,003: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:25,621: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:26,029: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:26,446: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:27,261: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:27,671: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:28,082: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:28,694: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:29,311: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:29,922: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:30,535: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:31,154: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:31,765: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:32,387: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:32,992: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:33,611: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:34,030: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:34,635: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:35,041: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:35,659: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:36,278: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:36,886: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:37,496: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:37,913: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:38,564: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:39,143: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:39,754: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:40,409: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:40,992: INFO/MainProcess] Task tasks.p_panties[e196c6bf-2b87-4bb2-ae11-452e3c41434f] succeeded in 23.767645187006565s: [{'src': 'https://i.ytimg.com/vi/3bU-AtShW7Y/maxresdefault.jpg', 'name': 'pan videos', 'url':...

, - Post. , . , . . , , .

environ\
  |-src\
     |-blog\
        |-migrations\
        |-static\
        |-templates\
        |-templatetags\
        |-__init__.py
        |-admin.py
        |-forms.py
        |-models
        |-tasks
        |-urls
        |-views

+4

python django queue rabbitmq django-celery

losee 13 . '16 13:06

1

SpiXel · Accepted Answer · 2016-09-11T17:46:19+0000

Django

, Python, , , Django.

, , python Django , . manage.py, manage.py , python script . DJANGO_SETTINGS_MODULE.

, , , python script, , python.

, django, ( django), manage.py - :

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myapp.settings")

Go , DEFAULT_SETTINGS_MODULE, :

os.environ.setdefault("DJANGO_SETTINGS_MODULE", DEFAULT_SETTINGS_MODULE)

script django ( env var), .

, :

import os
import sys

# Make the project root importable so that ``manage`` (and the Django apps
# next to it) can be found when this file is run outside manage.py.
sys.path.insert(0, "/path/to/parent/of/src")  # e.g. /home/projects/my-crawler

from manage import DEFAULT_SETTINGS_MODULE

# setdefault keeps any DJANGO_SETTINGS_MODULE already set in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", DEFAULT_SETTINGS_MODULE)

import django

# Must run before any model is imported; otherwise Django raises
# ImproperlyConfigured ("settings are not configured").
django.setup()

# ... the rest of your script (model imports, task definitions, ...) ...

, . celery, .delay() .apply_async(), , .

- python python manage.py shell, django . .

, , , redis (, -... .., ).

.

Redis

import json

from redis import StrictRedis

redis = StrictRedis(host='localhost', port=6379, db=0)

# Store the whole result list as a single JSON string under a per-task key.
# ``entries`` must contain only JSON-serializable values (no model instances).
redis.set("scraping:tasks:results:TASK-ID-HERE", json.dumps(entries))

, Redis /.

, ,

# Queue one RPUSH per scraped entry on a pipeline; the queued commands are
# sent when execute() is called.
with redis.pipeline() as batch:
    for entry in entries:
        serialized = json.dumps(entry)
        batch.rpush("scraping:tasks:results", serialized)
    batch.execute()

----

, , . :

@celery_app.task
def handle_scraping_results(entries):
    """Handle the list of scraped ``entries`` in a separate Celery task.

    Placeholder body: do whatever you want with the ``entries`` array here
    (for example, create the corresponding database rows).

    Args:
        entries: the list of entry dicts produced by the scraping task.
    """

p_panties :

handle_scraping_results.delay(entries)

RabbitMQ, p_panties, handle_scraping_results. , , , , ! . , . RabbitMQ A ( p_panties) B ( handle_result) ( RPC).

rabbitmq, redis. celery, , , . , ( ). async, . , python manage.py, .

--------- II

. , , - .

, ( )
Redis , , , django.

, , . .

, redis, , , , .

import json

# GET only reads one exact key and does not expand "*"; SCAN with a match
# pattern is what enumerates every stored result key.
for key in redis.scan_iter(match="scraping:tasks:results:*"):
    raw_value = redis.get(key)
    if raw_value is None:
        # The key expired or was deleted between the scan and the read.
        continue
    if isinstance(raw_value, bytes):
        # Without decode_responses=True the client returns bytes, which
        # json.loads does not accept on older Python 3 versions.
        raw_value = raw_value.decode("utf-8")
    entries = json.loads(raw_value)
    for entry in entries:
        title = entry['text']
        # Titles act as the uniqueness key; skip anything already stored.
        if Post.objects.filter(title=title).exists():
            continue
        post = Post()
        post.title = title
        post.name = entry['name']
        post.url = entry['url']
        post.body = entry['comments']
        post.image_url = entry['src']
        post.video_path = entry['embed']
        # NOTE(review): after a JSON round-trip this cannot be a User
        # instance; confirm what the scraping side stores under 'author'.
        post.author = entry['author']
        post.video = entry['video']
        post.status = 'draft'
        # The post must be saved before tags can be attached to it.
        post.save()
        post.tags.add("video", "Musica")

What is the right way to scrape asynchronously and store my results with Django, Celery and Redis?

Django

More articles: