root/oss/amplee/amplee/examples/miror/miror.py

Revision 606, 7.1 kB (checked in by sylvain, 1 year ago)

fixed is_draft methods as per Eric Larson report

Line 
1 # -*- coding: utf-8 -*-
2
3 ## Redistribution and use in source and binary forms, with or without modification,
4 ## are permitted provided that the following conditions are met:
5 ##
6 ##      * Redistributions of source code must retain the above copyright notice,
7 ##        this list of conditions and the following disclaimer.
8 ##      * Redistributions in binary form must reproduce the above copyright notice,
9 ##        this list of conditions and the following disclaimer in the documentation
10 ##        and/or other materials provided with the distribution.
11 ##      * Neither the name of Sylvain Hellegouarch nor the names of his contributors
12 ##        may be used to endorse or promote products derived from this software
13 ##        without specific prior written permission.
14 ##
15 ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ## ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 ## WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 ## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
19 ## FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 ## DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 ## SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 ## CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 ## OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 ## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26 __doc__ = """
27 This module demonstrates how you can use amplee to mirror an existing AtomPub service to your local machine as well as index each entry while importing.
28
29 Basically just run the module as-is after changing the service document URI and username and password if required.
30
31 This will go and fetch the service document, look up each collection it defines and import each member.
32
33 This module is not foolproof: it may break if the service document or the atom entries are doing something funny, but it might be a good starting point for anyone.
34
35 Once the import is done, you'll have a new directory called 'repository' and its sub-directories are reflecting each collection found in the service document.
36
37 You will also have a file named 'index.p' representing the indexed atom entries. You can then do something such as, from a python interpreter:
38
39 >>> from miror import setup_index
40 >>> i = setup_index()
41 >>> from datetime import datetime
42 >>> ui = i.retrieve('ui')
43 >>> r0 = ui.between(datetime(2007, 10, 24, 19, 50), datetime(2007, 10, 24, 20, 10))
44 >>> ci = i.retrieve('ci')
45 >>> r1 = ci.lookup(term='test')
46 >>> ai = i.retrieve('ai')
47 >>> r2 = ai.lookup('some name')
48 >>> r0 & r1 | r2
49 """
50
51 __author__ = ['Sylvain Hellegouarch']
52 __license__ = 'BSD'
53 __version__ = '0.1.0'
54
55 from urlparse import urlparse
56 import os
57
58 base_dir = os.getcwd()
59
60 import httplib2
61 import amara
62 from amplee.loader import AtomPubLoader
63 from amplee.atompub.member import MemberResource
64 from amplee.utils import extract_url_trail
65 from amplee.indexer import *
66
class ResourceWrapper(MemberResource):
    """Member resource whose id is derived from the entry's edit link."""

    def generate_resource_id(self, entry=None, slug=None, info=None):
        """Return the resource id to use for ``entry``.

        The id is whatever ``extract_url_trail`` returns for the ``href`` of
        the first ``atom:link[@rel="edit"]`` element found on ``entry``,
        with ``'.atom'`` appended. ``slug`` and ``info`` are accepted but
        not used here.
        """
        edit_links = entry.xml_xpath('atom:link[@rel="edit"]')
        first_href = edit_links[0].href
        trail = extract_url_trail(first_href)
        return trail + '.atom'
71    
def run(service_uri, username=None, password=None):
    """Mirror the AtomPub service described at ``service_uri``.

    Fetches the service document, imports (and indexes) the members of
    each of its collections, then writes the service document to
    ``service.xml`` under ``base_dir``.

    ``username`` and ``password`` are registered as credentials on the
    HTTP client only when ``username`` is truthy.
    """
    # The '.cache' directory under base_dir is handed to httplib2.Http
    h = httplib2.Http(os.path.join(base_dir, '.cache'))
    if username:
        h.add_credentials(username, password)

    index = setup_index()
    # Only the first element of the loader's result is needed here
    service, _xmldoc = load_service_document(h, service_uri)
    import_members(h, service, index)

    print("Saving service document as service.xml")
    # Serialise before opening the file so that a failure here does not
    # leave an empty, truncated service.xml behind.
    content = service.to_service().xml(indent=True)
    f = open(os.path.join(base_dir, 'service.xml'), 'w')
    try:
        f.write(content)
    finally:
        # Always release the file handle, even if the write fails
        f.close()
85
def setup_index():
    """Build and return the indexer used while importing entries.

    All indexes share a single ``ShelveContainer`` created for
    ``index.p`` under ``base_dir``. They are registered under the names
    'pi' (published), 'ui' (updated), 'ei' (edited), 'ai' (author) and
    'ci' (category), in that order.
    """
    indexer = Indexer()
    shelf = ShelveContainer(os.path.join(base_dir, 'index.p'))

    # (index class, registration name, extra keyword arguments)
    specs = (
        (PublishedIndex, 'pi', {'granularity': DateIndex.day}),
        (UpdatedIndex, 'ui', {'granularity': DateIndex.minute}),
        (EditedIndex, 'ei', {'granularity': DateIndex.minute}),
        (AuthorIndex, 'ai', {'index_email': True, 'index_uri': True}),
        (CategoryIndex, 'ci', {}),
    )
    for index_cls, name, options in specs:
        indexer.register(index_cls(name, container=shelf, **options))

    return indexer
96    
def load_service_document(h, service_uri):
    """Fetch the service document and load it through ``AtomPubLoader``.

    ``h`` is the HTTP client used to request ``service_uri``. When the
    response status is '200' or '304', the body is passed, together with
    the path of ``config.xml`` under ``base_dir``, to
    ``AtomPubLoader.load`` and its result is returned as-is.

    Raises ``StandardError`` for any other response status.
    """
    response, body = h.request(service_uri)
    if response['status'] not in ('200', '304'):
        raise StandardError("Could not retrieve '%s'" % service_uri)

    loader = AtomPubLoader(base_dir)
    config_path = os.path.join(base_dir, 'config.xml')
    return loader.load(config_path, body)
104
def _has_media_type(response, accepted):
    """Tell whether the response's content-type header is in ``accepted``."""
    # Compare case-insensitively and ignore spaces so that, for instance,
    # 'application/atom+xml; type=feed' is recognised. A response without
    # a content-type header never matches.
    content_type = response.get('content-type', '')
    return content_type.lower().replace(' ', '') in accepted


def import_members(h, service, index):
    """Import every member of every collection of ``service``.

    For each collection, ``index`` is added as an indexer, the collection
    feed is requested through ``h`` and each ``atom:link[@rel="edit"]``
    found in it is fetched individually. Each fetched entry (and the
    resource behind its first ``edit-media`` link, when that can be
    retrieved) is attached to the collection and committed to its store.

    Collections or entries whose response status is not '200'/'304', or
    whose content-type is not an Atom feed/entry media type, are skipped.
    """
    ok_statuses = ('200', '304')
    feed_types = ('application/atom+xml',
                  'application/atom+xml;type=feed')
    entry_types = ('application/atom+xml',
                   'application/atom+xml;type=entry')

    for collection in service.get_collections():
        # This will allow automatic indexing of
        # each entry
        collection.add_indexer(index)

        uri = collection.get_base_edit_uri()
        feed_resp, feed_body = h.request(uri)
        if feed_resp['status'] not in ok_statuses or \
               not _has_media_type(feed_resp, feed_types):
            continue

        print("Loading: %s" % uri)

        # This creates the directory structure that will
        # contain the resources in the storage
        path_info = urlparse(uri)[2].strip('/')
        collection.name_or_id = path_info
        collection.store.storage.create_container(path_info)

        # Load the collection feed
        feed_doc = amara.parse(feed_body)
        feed_doc.xmlns_prefixes['app'] = "http://www.w3.org/2007/app"
        feed_doc.xmlns_prefixes['atom'] = "http://www.w3.org/2005/Atom"

        # Extract all the link[@rel='edit'] elements
        edit_links = feed_doc.feed.xml_xpath('//atom:link[@rel="edit"]')

        # Import each member
        for link in edit_links:
            # Instead of using the entry as it appears in the collection
            # feed, we load the most up-to-date representation
            entry_uri = unicode(link.href)
            entry_resp, entry_body = h.request(entry_uri)
            if entry_resp['status'] not in ok_statuses or \
                   not _has_media_type(entry_resp, entry_types):
                continue

            print("  Entry: %s" % entry_uri)

            entry_doc = amara.parse(entry_body)
            entry_doc.xmlns_prefixes['atom'] = "http://www.w3.org/2005/Atom"

            # Fetch the media resource too when the entry links to one;
            # it stays None if there is no link or the request fails
            media_content = None
            media = entry_doc.entry.xml_xpath('//atom:link[@rel="edit-media"]')
            if media:
                media_resp, media_body = h.request(unicode(media[0].href))
                if media_resp['status'] in ok_statuses:
                    media_content = media_body

            member = ResourceWrapper(collection, 'application/atom+xml;type=entry')
            member.from_entry(entry_doc.entry)
            collection.attach(member, member_content=member.atom.xml(),
                              media_content=media_content)
            collection.store.commit(message='Adding %s' % member.member_id)
161
# Script entry point: mirror a sample AtomPub service with test credentials.
# Change the service document URI, username and password as required.
if __name__ == "__main__":
   run('http://snellspace.dyndns.org:9080/weblogs/services/atom', 'test', 'test')
Note: See TracBrowser for help on using the browser.