You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
44 lines
1.0 KiB
44 lines
1.0 KiB
2 years ago
|
#!/usr/bin/env python3
|
||
|
|
||
|
import os
|
||
|
import json
|
||
|
import sys
|
||
|
from textblob import TextBlob
|
||
|
from newspaper import Article
|
||
|
import re
|
||
|
|
||
|
|
||
|
def main(params):
    """Summarise the article at ``params["url"]`` and return it as sentences.

    The page is fetched and parsed with newspaper's ``Article``; its ``nlp()``
    step is run and the resulting ``summary`` text is cleaned up: newlines
    become spaces, bracketed numeric markers such as ``[12]`` are removed, and
    the text is split on every period into stripped, non-empty fragments.

    Args:
        params: Mapping that must contain a ``"url"`` key.

    Returns:
        A dict with ``"activation_id"`` (the ``__OW_ACTIVATION_ID`` environment
        variable passed through ``str``, so ``"None"`` when it is unset) and
        ``"processed_data"`` (the list of fragments). The same payload is also
        printed to stdout as JSON.

    Raises:
        KeyError: If ``params`` has no ``"url"`` entry.
    """
    page = Article(params["url"])
    page.download()
    page.parse()
    page.nlp()

    # Flatten the summary to one line, then drop "[<digits>]" markers.
    summary = re.sub(r'\n', ' ', page.summary)
    summary = re.sub(r'\[\d+\]', '', summary)

    # Split on literal periods; keep only fragments that have content left
    # after trimming surrounding whitespace.
    sentences = []
    for fragment in re.split(r'\.', summary):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)

    result = {
        "activation_id": str(os.environ.get('__OW_ACTIVATION_ID')),
        "processed_data": sentences,
    }
    print(json.dumps(result))
    return result
|
||
|
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Script mode: the action parameters are expected as a JSON object in the
    # first command-line argument. The previous code passed a module-level
    # name `params` that was never defined, so direct execution always
    # failed with NameError.
    if len(sys.argv) < 2:
        sys.exit(
            "usage: pass the action parameters as a JSON object, "
            "e.g. '{\"url\": \"https://example.com/article\"}'"
        )
    main(json.loads(sys.argv[1]))
|