
I'm the definition of a noob. I know next to nothing about Python and am looking for help. I can read just enough code to alter variables to suit my wants/needs, but when it comes to doing something the original code didn't call for... I'm lost.

So here is the deal: I found a Craigslist (CL) flagging script that originally searched ALL CL sites and flagged posts that contained a specific keyword (it was written to flag all posts that mentioned Scientology).

I altered it to only search CL sites in my general area (15 sites instead of 437), and it still looks for specific keywords, which I have changed. I want to automatically flag people who continuously spam CL and make it hard to sort through postings, as I do a lot of business on CL.

What I want the script to do is loop until it can no longer find posts that meet the criteria, changing proxy servers after each loop, with a place inside the script where I would put the proxies' IP addresses.

I look forward to your replies.

Here is the altered code I have:

#!/usr/bin/env python
# -*- coding: utf-8 -*-


import urllib
from twill.commands import * # gives us go()

areas = ['sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt', 'mendocino', 'modesto', 'monterey', 'redding', 'reno', 'sacramento', 'siskiyou', 'stockton', 'yubasutter', 'reno']

def expunge(url, area):
    page = urllib.urlopen(url).read() # <-- and v and vv gets you urls of ind. postings
    page = page[page.index('<hr>'):].split('\n')[0]
    page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]

    for u in page:
        num = u[u.rfind('/')+1:u.index('.html')] # the number of the posting (like 34235235252)
        spam = 'https://post.craigslist.org/flag?flagCode=15&postingID='+num # url for flagging as spam
        go(spam) # flag it


print 'Checking ' + str(len(areas)) + ' areas...'

for area in ['http://' + a + '.craigslist.org/' for a in areas]:
    ujam = area + 'search/?query=james+"916+821+0590"+&catAbb=hhh'
    udre = area + 'search/?query="DRE+%23+01902542+"&catAbb=hhh'
    try:
        jam = urllib.urlopen(ujam).read()
        dre = urllib.urlopen(udre).read()
    except:
        print 'tl;dr error for ' + area
        continue # jam/dre were never set, so skip this area

    if 'Found: ' in jam:
        print 'Found results for "James 916 821 0590" in ' + area
        expunge(ujam, area)
        print 'All "James 916 821 0590" listings marked as spam for area'

    if 'Found: ' in dre:
        print 'Found results for "DRE # 01902542" in ' + area
        expunge(udre, area)
        print 'All "DRE # 01902542" listings marked as spam for area'
Timothy Core

4 Answers


you can create a constant loop like this:

while True:
    if condition:
        break

itertools has a handful of tricks for iterating: http://docs.python.org/2/library/itertools.html

notably, check out itertools.cycle

(These are meant as pointers in the right direction; you could craft a solution with one, the other, or even both.)
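For instance, itertools.cycle fits the "changing proxy servers after each loop" part of the question; it loops over a sequence endlessly. A minimal sketch (the proxy addresses here are placeholders, not real ones):

import itertools

proxies = itertools.cycle(['1.2.3.4:8080', '5.6.7.8:3128'])  # placeholder proxies

print next(proxies)  # 1.2.3.4:8080
print next(proxies)  # 5.6.7.8:3128
print next(proxies)  # 1.2.3.4:8080 again, since cycle() wraps around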

Jonathan Vanasco
  • Sorry, I don't get it... I tried to add the repeat() into the code but I keep getting Traceback (most recent call last): File "/home/quonundrum/Desktop/CL.py", line 43, in repeat('spam, 4') NameError: name 'repeat' is not defined – Timothy Core Feb 19 '13 at 21:57
  • `import itertools as it` then call `it.repeat()` – askewchan Feb 19 '13 at 22:27
  • I've tried it.repeat('go, 4'), it.repeat('go(spam), 4'), it.repeat('expunge'), it.repeat('ujam').. and a whole bunch of others... it isn't repeating, but also is not giving any errors. – Timothy Core Feb 19 '13 at 22:54
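For reference, itertools.repeat takes an object and a count, not a string of code to execute, which is why calls like it.repeat('go(spam), 4') run without errors but never call anything. A minimal sketch of the correct usage:

import itertools as it

# it.repeat(obj, n) yields obj itself n times; it never executes a
# string like 'go(spam), 4' as code
for _ in it.repeat(None, 4):
    print 'flagging...' # the real call, e.g. go(spam), would go here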

I made a few changes to your code. It looks to me like the function expunge already loops through all the results on the page, so I'm not sure what extra loop you need. At the end there's an example of how you could check whether results were found, but as written there are no loops to break out of.

Don't know how to change the proxy/ip.

btw, you had 'reno' twice.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
from twill.commands import go

areas = ['sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt',
        'mendocino', 'modesto', 'monterey', 'redding', 'reno',
        'sacramento', 'siskiyou', 'stockton', 'yubasutter']
queries = ['james+"916+821+0590"','"DRE+%23+01902542"']

def expunge(url, area):
    page = urllib.urlopen(url).read() # <-- and v and vv gets you urls of ind. postings
    page = page[page.index('<hr>'):].split('\n')[0]
    page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]

    for u in page:
        num = u[u.rfind('/')+1:u.index('.html')] # the number of the posting (like 34235235252)
        spam = 'https://post.craigslist.org/flag?flagCode=15&postingID='+num # url for flagging as spam
        go(spam) # flag it

print 'Checking ' + str(len(areas)) + ' areas...'

for area in areas:
    for query in queries:
        qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
        try:
            q = urllib.urlopen(qurl).read()
        except:
            print 'tl;dr error for {} in {}'.format(query, area)
            break

        if 'Found: ' in q:
            print 'Found results for {} in {}'.format(query, area)
            expunge(qurl, area)
            print 'All {} listings marked as spam for area'.format(query)
        elif 'Nothing found for that search' in q:
            print 'No results for {} in {}'.format(query, area)
            break
        else:
            break
askewchan
  • Cool, that looks a lot better. Is there a way to have it continue to run until it doesn't get anymore results? – Timothy Core Feb 19 '13 at 23:57
  • Do you mean you expect the results page to change while the program is running? – askewchan Feb 20 '13 at 00:02
  • well in the shell it shows as things are found / flagged. So I was wondering if there was a way for the script to continue to run until there were no more results for the keywords that were searched (IE all of the results were flagged until removed). – Timothy Core Feb 20 '13 at 00:27
  • Looking at the new code ... On line 17 'for u in page' the original code used ujam and udre in there for the queries and to open the flag links ... i can't see where the new code uses that. – Timothy Core Feb 20 '13 at 18:05
  • inside `def expunge`? I don't see reference to `ujam` or `udre` anywhere in there, in the original or my code. – askewchan Feb 20 '13 at 18:12
  • it was under the ' def expunge ' part – Timothy Core Feb 20 '13 at 19:11
  • can you take a look at the new one I put up and let me know if it would work? – Timothy Core Feb 20 '13 at 23:01
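For what it's worth, a rough sketch of that "keep running until nothing is left" idea, building on the script above and assuming expunge is changed to return how many posts it flagged (a hypothetical change, not in the code as posted):

while True:
    total_flagged = 0
    for area in areas:
        for query in queries:
            qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
            try:
                q = urllib.urlopen(qurl).read()
            except IOError:
                continue
            if 'Found: ' in q:
                total_flagged += expunge(qurl, area) # hypothetical: returns a count
    if total_flagged == 0:
        break # no posts matched anywhere, so stop looping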

I've made some changes... not sure how well they're working, but I'm not getting any errors. Please let me know if you find anything that is wrong/missing. - Thanks

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib, urllib2
from twill.commands import go


proxy = urllib2.ProxyHandler({'https': '108.60.219.136:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
proxy2 = urllib2.ProxyHandler({'https': '198.144.186.98:3128'})
opener2 = urllib2.build_opener(proxy2)
urllib2.install_opener(opener2)
proxy3 = urllib2.ProxyHandler({'https': '66.55.153.226:8080'})
opener3 = urllib2.build_opener(proxy3)
urllib2.install_opener(opener3)
proxy4 = urllib2.ProxyHandler({'https': '173.213.113.111:8080'})
opener4 = urllib2.build_opener(proxy4)
urllib2.install_opener(opener4)
proxy5 = urllib2.ProxyHandler({'https': '198.154.114.118:3128'})
opener5 = urllib2.build_opener(proxy5)
urllib2.install_opener(opener5)


areas = ['sfbay', 'chico', 'fresno', 'goldcountry', 'humboldt',
        'mendocino', 'modesto', 'monterey', 'redding', 'reno',
        'sacramento', 'siskiyou', 'stockton', 'yubasutter']
queries = ['james+"916+821+0590"','"DRE+%23+01902542"']

def expunge(url, area):
    page = urllib.urlopen(url).read() # <-- and v and vv gets you urls of ind. postings
    page = page[page.index('<hr>'):].split('\n')[0]
    page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]

    for u in page:
        num = u[u.rfind('/')+1:u.index('.html')] # the number of the posting (like 34235235252)
        # urllib2.urlopen() already sends each flag request through the
        # installed proxy opener, so its response need not be passed to go()
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=15&postingID='+num) # flag it
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=28&postingID='+num) # flag it
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=16&postingID='+num) # flag it

print 'Checking ' + str(len(areas)) + ' areas...'

for area in areas:
    for query in queries:
        qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
        try:
            q = urllib.urlopen(qurl).read()
        except:
            print 'tl;dr error for {} in {}'.format(query, area)
            break

        if 'Found: ' in q:
            print 'Found results for {} in {}'.format(query, area)
            expunge(qurl, area)
            print 'All {} listings marked as spam for {}'.format(query, area)
            print ''
            print ''
        elif 'Nothing found for that search' in q:
            print 'No results for {} in {}'.format(query, area)
            print ''
            print ''
            break
        else:
            break
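Note that each urllib2.install_opener() call replaces the global opener, so in the block above only proxy5 actually ends up handling requests. A minimal sketch of rotating through the handlers instead, using a hypothetical rotate_proxy() helper (not part of the original script):

import itertools

rotation = itertools.cycle([proxy, proxy2, proxy3, proxy4, proxy5])

def rotate_proxy():
    # install the next proxy handler before the next round of requests
    urllib2.install_opener(urllib2.build_opener(next(rotation)))

Calling rotate_proxy() at the top of each pass over areas would then switch which proxy urllib2 uses.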
Timothy Core
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib, urllib2
from twill.commands import go


proxy = urllib2.ProxyHandler({'https': '108.60.219.136:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
proxy2 = urllib2.ProxyHandler({'https': '198.144.186.98:3128'})
opener2 = urllib2.build_opener(proxy2)
urllib2.install_opener(opener2)
proxy3 = urllib2.ProxyHandler({'https': '66.55.153.226:8080'})
opener3 = urllib2.build_opener(proxy3)
urllib2.install_opener(opener3)
proxy4 = urllib2.ProxyHandler({'https': '173.213.113.111:8080'})
opener4 = urllib2.build_opener(proxy4)
urllib2.install_opener(opener4)
proxy5 = urllib2.ProxyHandler({'https': '198.154.114.118:3128'})
opener5 = urllib2.build_opener(proxy5)
urllib2.install_opener(opener5)


areas = ['capecod']
queries = ['rent','rental','home','year','falmouth','lease','credit','tenant','apartment','bedroom','bed','bath']

def expunge(url, area):
    page = urllib.urlopen(url).read() # <-- and v and vv gets you urls of ind. postings
    page = page[page.index('<hr>'):].split('\n')[0]
    page = [i[:i.index('">')] for i in page.split('href="')[1:-1] if '<font size="-1">' in i]

    for u in page:
        num = u[u.rfind('/')+1:u.index('.html')] # the number of the posting (like 34235235252)
        # urllib2.urlopen() already sends each flag request through the
        # installed proxy opener, so its response need not be passed to go()
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=15&postingID='+num) # flag it
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=28&postingID='+num) # flag it
        urllib2.urlopen('https://post.craigslist.org/flag?flagCode=16&postingID='+num) # flag it

print 'Checking ' + str(len(areas)) + ' areas...'

for area in areas:
    for query in queries:
        qurl = 'http://' + area + '.craigslist.org/search/?query=' + query + '+&catAbb=hhh'
        try:
            q = urllib.urlopen(qurl).read()
        except:
            print 'tl;dr error for {} in {}'.format(query, area)
            break

        if 'Found: ' in q:
            print 'Found results for {} in {}'.format(query, area)
            expunge(qurl, area)
            print 'All {} listings marked as spam for {}'.format(query, area)
            print ''
            print ''
        elif 'Nothing found for that search' in q:
            print 'No results for {} in {}'.format(query, area)
            print ''
            print ''
            break
        else:
            break
Lafexlos