main.py and config.yaml are now excluded from version control via .gitignore; only example files are provided. The README is updated accordingly.
This commit is contained in:
parent
4ec9f76080
commit
0808e5a42d
4 changed files with 203 additions and 1 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,3 +1,5 @@
|
|||
spiders/config.yaml
|
||||
main.py
|
||||
/venv
|
||||
/spiders/pages/**
|
||||
/spiders/output/**
|
||||
|
|
14
README.md
14
README.md
|
@ -80,6 +80,20 @@ pip install -r requirements.txt
|
|||
|
||||
# Usage
|
||||
|
||||
Use it step by step. First, take care of the HTML pages that contain the lists of links.
|
||||
Then take care of getting the first JSON output from the first layer of HTML
|
||||
pages.
|
||||
|
||||
Copy the two example files to the file names under which they will be
|
||||
used as input to the spider:
|
||||
|
||||
```
|
||||
cp main.py_example main.py
|
||||
|
||||
cp spiders/config.yaml_example spiders/config.yaml
|
||||
|
||||
```
|
||||
|
||||
## Configuration File Syntax
|
||||
|
||||
The configuration file with working syntax template is
|
||||
|
|
28
main.py_example
Normal file
28
main.py_example
Normal file
|
@ -0,0 +1,28 @@
|
|||
from spiders.fdb_spider import *
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
config = "spiders/config.yaml"
|
||||
#list_of_fdbs = eval(sys.argv[1])
|
||||
#list_of_fdbs = ["giz","evergabe-online","foerderinfo.bund.de-bekanntmachungen"]
|
||||
#list_of_fdbs = ["giz","evergabe-online"]
|
||||
#list_of_fdbs = ["foerderinfo.bund.de-bekanntmachungen"]
|
||||
list_of_fdbs = ["ted.europa.eu"]
|
||||
#list_of_fdbs = ["dtvp"]
|
||||
|
||||
|
||||
# Crawl the government websites
|
||||
|
||||
spider = fdb_spider(config)
|
||||
|
||||
spider.download_entry_list_pages_of_funding_databases(list_of_fdbs)
|
||||
|
||||
#spider.find_config_parameter(list_of_fdbs)
|
||||
|
||||
spider.parse_entry_list_data2dictionary(list_of_fdbs)
|
||||
|
||||
#spider.download_entry_data_htmls(list_of_fdbs)
|
||||
|
||||
#spider.parse_entry_data2dictionary(list_of_fdbs)
|
||||
|
158
spiders/config.yaml_example
Normal file
158
spiders/config.yaml_example
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue