Softwareadvicescrapereviews avatar

Softwareadvicescrapereviews

Try for free

No credit card required

Go to Store
Softwareadvicescrapereviews

Softwareadvicescrapereviews

undrtkr984/softwareadvicescrapereviews
Try for free

No credit card required

Dockerfile

1# First, specify the base Docker image. You can read more about
2# the available images at https://sdk.apify.com/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:16
5
6# Second, copy just package.json and package-lock.json since those are the only
7# files that affect "npm install" in the next step, to speed up the build.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, rebuilds will be really fast
24# for most source file changes.
25COPY . ./
26
27# Optionally, specify how to launch the source code of your actor.
28# By default, Apify's base Docker images define the CMD instruction
29# that runs the Node.js source code using the command specified
30# in the "scripts.start" section of the package.json file.
31# In short, the instruction looks something like this:
32#
33# CMD npm start

INPUT_SCHEMA.json

1{
2    "title": "Input Schema",
3    "description": "To update the crawler to another site, you need to change the startUrls option!",
4    "type": "object",
5    "schemaVersion": 1,
6    "properties": {
7        "startUrls": {
8            "title": "Start URLs",
9            "type": "array",
10            "description": "A static list of URLs to scrape. It is recommended to use only one start URL. Navigate to a company/product page, copy its URL (e.g. https://www.softwareadvice.com/fleet-management/gps-insight-profile/), and use that as the start URL.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in the README.",
11            "prefill": [{ "url": "https://www.softwareadvice.com/fleet-management/gps-insight-profile/" }],
12            "editor": "requestListSources"
13        },
14        "maxRequestRetries": {
15            "title": "Max page retries",
16            "type": "integer",
17            "description": "The maximum number of times the scraper will retry to load each web page on error, in case of a page load error or an exception thrown by <b>Page function</b>.<br><br>If set to <code>0</code>, the page will be considered failed right after the first error.",
18            "minimum": 0,
19            "prefill": 2,
20            "default": 2
21        }
22    },
23    "required": ["startUrls"]
24}

main.js

// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.
//
// The actor reads its input, builds an input object for the apify/web-scraper
// actor (including the page function that extracts the reviews) and then
// metamorphs into apify/web-scraper, which does the actual crawling.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the actor. Input fields can be modified in INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actors/development/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Fail with a readable message instead of a TypeError on `input.startUrls`
    // when the actor is started without any input.
    if (!input || !Array.isArray(input.startUrls) || input.startUrls.length === 0) {
        throw new Error('Input must contain a non-empty "startUrls" array.');
    }

    // Here you can prepare the input for actor apify/web-scraper. This input is
    // based on the actor task that was used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "maxRequestRetries": input.maxRequestRetries,
        // NOTE: this must stay a self-contained `function` expression. It is
        // executed by apify/web-scraper inside the browser page, so it cannot
        // reference anything from this file's scope; all helpers live inside it.
        // NOTE(review): presumably the function is serialized to a string when the
        // input is sent to the target actor; confirm against the Apify client docs.
        /**
         * Page function executed for every loaded page.
         * For a complete list of the context properties and functions,
         * see https://apify.com/apify/web-scraper#page-function
         *
         * @param {object} context - web-scraper page function context; this code uses
         *   `jQuery`, `waitFor`, `request`, `log` and `enqueueRequest` from it.
         * @returns {Promise<{results: object[]}|undefined>} The scraped reviews, or
         *   `undefined` when the page only enqueued its dedicated reviews page.
         */
        "pageFunction": async function pageFunction(context) {
            // jQuery is handy for finding DOM elements and extracting data from them.
            // To use it, the "Inject jQuery" option has to be enabled (see above).
            const $ = context.jQuery;

            // Pagination keeps going while at most this many reviews were collected,
            // so the run does not time out.
            const MAX_REVIEWS = 250;
            // Time given to the page to (re)render the review list.
            const RENDER_WAIT_MILLIS = 1500;

            // Returns the inner text of the first match of `selector` under `root`,
            // or an empty string when nothing matches.
            const textOf = (root, selector) => {
                const element = root.querySelector(selector);
                return element ? element.innerText : '';
            };

            // Looks through the direct children of the ".wrapper" element for the
            // entry labelled "Reviews" and returns its link target, or an empty
            // string when the wrapper, the entry or the link is missing.
            const findReviewsPageUrl = () => {
                const wrapper = document.querySelector('.wrapper');
                if (!wrapper) {
                    return '';
                }
                for (const child of Array.from(wrapper.childNodes)) {
                    // Text nodes have no children, so the first child may be undefined.
                    const firstChild = child.childNodes[0];
                    if (firstChild && firstChild.innerText === 'Reviews') {
                        return firstChild.href || '';
                    }
                }
                return '';
            };

            // Builds one result record from a review element. Missing fields are
            // reported as empty strings; the score is the number of full stars.
            const extractReview = (review) => ({
                score: review.querySelectorAll("[data-testid='reviewers-rating'] .OverallStarRatingComponent .fullStar").length,
                companySize: textOf(review, 'div.review-company > p > strong'),
                industry: textOf(review, 'div.review-gdm-industry > p > strong'),
                timeUsed: textOf(review, 'div.review-profile-time-used > p > strong'),
                reviewSource: textOf(review, 'div.tooltip > p'),
                date: textOf(review, '#reviews-list .review-date'),
                title: textOf(review, "[data-testid='review-title']"),
                summary: textOf(review, "[data-testid='review-summary']"),
                pros: textOf(review, "[data-testid='review-pros']"),
                cons: textOf(review, "[data-testid='review-cons']"),
            });

            await context.waitFor(1000);

            // If this is not the reviews page, check whether the product has a full
            // reviews page. If it does, enqueue that page and scrape it instead.
            if (!context.request.url.endsWith('/reviews/')) {
                context.log.info(context.request.url);
                const reviewsPageUrl = findReviewsPageUrl();
                context.log.info(reviewsPageUrl);
                if (reviewsPageUrl !== '') {
                    await context.enqueueRequest({ url: reviewsPageUrl });
                    return;
                }
            }

            await context.waitFor(RENDER_WAIT_MILLIS);

            const results = [];
            let hasNextPage = true;
            while (hasNextPage) {
                // Query the reviews again on every iteration: after clicking "next"
                // the previously selected elements no longer represent the new page.
                const reviews = $("[data-testid='reviews-container']").toArray();
                for (const review of reviews) {
                    results.push(extractReview(review));
                }

                // Paginate only when there is exactly one "next" control.
                const nextButtons = document.getElementsByClassName('next');
                const nextButton = nextButtons.length === 1 ? nextButtons[0] : null;
                // NOTE(review): assumes the site marks an exhausted "next" control with
                // the `disabled` property or a "disabled" class - confirm on the site.
                const isDisabled = nextButton !== null
                    && (nextButton.disabled === true || nextButton.classList.contains('disabled'));

                hasNextPage = nextButton !== null && !isDisabled && results.length <= MAX_REVIEWS;
                if (hasNextPage) {
                    nextButton.click();
                    await context.waitFor(RENDER_WAIT_MILLIS);
                }
            }

            // Return an object with the data extracted from the page.
            // It will be stored to the resulting dataset.
            return {
                results: results
            };
        },
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept a single argument: the "crawlingContext" object.
            [
                async (crawlingContext) => {
                    // ...
                },
            ]`,
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept two arguments: the "crawlingContext" object
            // and "gotoOptions".
            [
                async (crawlingContext, gotoOptions) => {
                    // ...
                },
            ]`,
        "proxyConfiguration": {
            "useApifyProxy": true,
            "apifyProxyCountry": "US"
        },
        "startUrls": input.startUrls,
        "runMode": "PRODUCTION",
        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now let's metamorph into actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});

package.json

1{
2    "name": "my-actor",
3    "version": "0.0.1",
4    "dependencies": {
5        "apify": "^2.2.2"
6    },
7    "scripts": {
8        "start": "node main.js"
9    },
10    "author": "Me!"
11}
Developer
Maintained by Community

Actor Metrics

  • 1 monthly user

  • 1 star

  • >99% runs succeeded

  • Created in Jan 2023

  • Modified 2 years ago

Categories