Grabbing with Python and Grab framework


The program uses one of the Russian court websites as its data source; in the code below this is https://vytegorsky--vld.sudrf.ru.
It searches for the specified words in the list of scheduled cases for each day, from the current day through the configured number of days ahead, and sends the search results by e-mail.

The words to search for are set in the search_list list.
The number of days to check, starting from the current day, is set in the DaysFuture variable.
The SMTP server settings and the sender and recipient mailboxes are also set in the code. The execution progress is logged to the example_app.log file.


SearchOnTheWebsiteExample-en.py
# Search for text strings on the court's website in the
#   "To-do List" section (the cases listed for each hearing day):
#   - sends the search results by email;
#   - logs the execution process.
 
from grab import Grab
from datetime import datetime, timedelta
import smtplib
from smtplib import email
from email.mime.text import MIMEText
import logging
 
# Logging: records go to example_app.log, formatted as "<timestamp>  <message>"
formatter = logging.Formatter('%(asctime)s  %(message)s')
fh = logging.FileHandler("example_app.log")
fh.setFormatter(formatter)

# The script's logger passes INFO and higher to the file handler
logger = logging.getLogger("example_app")
logger.addHandler(fh)
logger.setLevel(logging.INFO)

# Opening records of a run: an empty-message record, then the "Start" marker
for start_message in ("", "Start"):
    logger.info(start_message)
 
# How many consecutive days to check, counting the current day as the first
DaysFuture = 5

# Search table, one two-element list per search word:
#   [0] the text to look for (string)
#   [1] dates of the pages on which the text was found (filled in by the date loop)
search_list = [
    [u'SEARCH_WORD_1', []],
    [u'SEARCH_WORD_2', []],
]

# ---- settings for sending mail messages ----

# SMTP server (for example smtp.gmail.com) and the port to connect to
port = 465
smtp_server = "smtp.gmail.com"

# sender mailbox (for example sender@gmail.com) and its password
# NOTE(review): the password is kept in plain text in this file
email_from = "SENDER"
password = "EMAIL_PASSWORD"

# recipient mailbox, for example recipient@gmail.com
email_dest = 'RECIPIENT'

# email subject: fixed prefix followed by the current date as YYYY-MM-DD
subject = u"Info on " + datetime.now().strftime("%Y-%m-%d")
 
# to log: mark the beginning of the page-fetching phase
logger.info("Grab start")
 
# One Grab instance, created with default options and reused for every
# request made in the date loop further down
g = Grab()
 
# Grab code page and timeout settings, applied through ``setup_grab``
def prepare(self):
    """Configure *self* with the document charset and then the timeout.

    ``self.setup_grab`` is called twice, once per option and in that order:
    ``document_charset='windows-1251'`` first, ``timeout=10`` second.

    NOTE(review): nothing in this script calls this function, so these
    options are not applied to the module-level ``g`` -- confirm whether
    that is intended.
    """
    grab_options = (
        {'document_charset': 'windows-1251'},
        {'timeout': 10},
    )
    for option in grab_options:
        self.setup_grab(**option)
 
# cycle by date
#
# For every day in the window [today, today + DaysFuture) the court page for
# that date is downloaded and each search word is looked up in it.

# how many times a page download is tried before the date is given up on
max_attempts = 10

# Take "now" once so that every date is computed from the same starting point,
# even if the run crosses midnight.
start_date = datetime.now()

for CounterDays in range(DaysFuture):

    CounterDate = (start_date + timedelta(days=CounterDays)).strftime("%d.%m.%Y")
    url = 'https://vytegorsky--vld.sudrf.ru/modules.php?name=sud_delo&srv_num=1&H_date=' + CounterDate

    # after max_attempts failed attempts to get data, write to log and go to
    # the next date
    for counter_go in range(1, max_attempts + 1):
        try:
            g.go(url)
            break
        except Exception:
            # Best effort: any download error just triggers another attempt;
            # only the last failure is logged (with its traceback).
            if counter_go == max_attempts:
                logger.exception("Grab error: " + url)
    else:
        # No attempt succeeded. g.doc would still hold the page of the
        # previous date (or nothing at all), so searching it would record
        # wrong hits for this date -- skip the date instead.
        continue

    # moving on the list of search words (column [0]),
    #  when the word is successfully found in the text of the page,
    #  we save the date of the page in the result column (column [1]),
    #  as an element of the list
    for row in search_list:
        # Byte-level search with the word encoded as cp1251
        # (presumably the encoding the site serves -- confirm).
        if g.doc.text_search(row[0].encode('cp1251'), byte=True):
            row[1].append(CounterDate)
 
# to log: page fetching is over
logger.info("Grab completed")

# to log: the mail phase begins
logger.info("Start sending an email message")

# Generate Message text:
#   one "<search word>: <date>,<date>,..." line for every search word that was
#   found on at least one page; words without any hit are left out, and the
#   text stays empty when nothing was found at all
email_text = '\n'.join(
    row[0] + ': ' + u','.join(row[1])
    for row in search_list
    if len(row[1]) > 0
)
 
# Send an email message
#
# The body built above is wrapped in a plain-text MIME message and sent over
# an SSL connection to the configured SMTP server. Any failure is logged
# with its traceback and does not abort the script.
m = MIMEText(email_text)

m['Subject'] = subject
m['From'] = email_from
m['To'] = email_dest

try:
    server = smtplib.SMTP_SSL(smtp_server, port)
    try:
        server.login(email_from, password)
        server.sendmail(email_from, email_dest, m.as_string())
    finally:
        # Always end the SMTP session, also when login or sending fails;
        # otherwise the connection would be left open.
        server.quit()
    # to log
    logger.info("Send mail message complete")
except Exception:
    # to log
    logger.exception("Error sending mail message")

# to log
logger.info("Completed")
  • en/grabbing_of_the_court_site_using_python.txt
  • Last modified: 2018/06/27 22:22
  • by 2SRTVF