你的位置:首页 > 信息动态 > 新闻中心
信息动态
联系我们

《2021博客之星年度总评选》数据采集样例程序

2022/1/1 7:04:37

《2021博客之星年度总评选》数据采集样例程序

  • pom.xml
  • 线上投票博客之星数据采集
  • 投票贡献排行榜数据采集


pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the data-collection sample programs.
  It declares the compiler level and the two third-party libraries the samples import:
  Selenium's Chrome driver (browser automation) and Apache POI OOXML (.xlsx output).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates. NOTE(review): "1" is an unusual groupId; a reverse-domain name is the Maven convention. -->
    <groupId>1</groupId>
    <artifactId>_psimplemvn</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <!-- Compile sources and emit bytecode at Java 8 level. -->
            <!-- NOTE(review): no plugin <version> is pinned here, so the version Maven resolves may vary between environments. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Selenium Chrome driver bindings; the samples import org.openqa.selenium.chrome.ChromeDriver. -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>4.0.0</version>
        </dependency>
        <!-- Apache POI OOXML support; the samples import org.apache.poi.xssf.usermodel.XSSFWorkbook to write .xlsx files. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>
    </dependencies>


</project>

线上投票博客之星数据采集

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: BlogStarStatisticsTest
 * Author: wangyetao
 * Date: 21-12-26 23:38:10
 * Description: 线上投票博客之星数据采集
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 */
package simple.call.blogstar;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Collects the candidate list of the CSDN "blog star" voting page and exports it to Excel.
 *
 * <p>The program opens {@code url} in Chrome through Selenium, reads every {@code <li>} under
 * the element with id {@code blogList}, and writes one spreadsheet row per entry to
 * {@code outPutPath + filename + suffix}.
 *
 * <p>Markup of a single entry as it appeared on the page when this sample was written:
 * <pre>{@code
 * <ul id="blogList">
 *   <li>
 *     <a target="_blank" href="https://bss.csdn.net/m/topic/blog_star2020/detail?username=qq_34361283">
 *       <span class="num">001</span>
 *       <div class="avatar">
 *         <img src="https://profile.csdnimg.cn/8/5/9/1_qq_34361283" alt="">
 *       </div>
 *       <div class="name">...</div>
 *       <div class="level"><i class="icon-level icon-level-5"></i>码龄6年</div>
 *       <div class="statistics">
 *         <p class="blog-num">2020年度原创博文:77 篇</p>
 *         <p class="current-vote">当前票数: <em>392</em> 票</p>
 *       </div>
 *     </a>
 *   </li>
 * </ul>
 * }</pre>
 *
 * @author wangyetao
 */
public class BlogStarStatisticsTest {

    /** Directory the workbook is written into; {@link FileOutputStream} does not create it. */
    private static final String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the output file, also used (upper-cased) as the sheet name. */
    private static final String filename = "blog_star2020";
    private static final String sheetname = filename.toUpperCase();
    private static final String suffix = ".xlsx";
    /** Page that lists all candidates. */
    private static final String url = "https://bss.csdn.net/m/topic/blog_star2020";
    /** Column titles, in the same order the cells are written in {@link #writeWorkbook(List)}. */
    private static final String[] HEADS = {
        "序号", "博客简称", "小头像url", "码龄(年)", "年度原创博文数", "当前票数", "录入时间"
    };

    /** Entries scraped by the most recent run of {@link #main(String[])}. */
    private static ArrayList<BlogStar> blogStars;

    /**
     * Scrapes the candidate list and writes it to the Excel file.
     *
     * @param args ignored
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Selenium needs to know where the chromedriver binary lives before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            // Give the page a moment to finish rendering the list.
            Thread.sleep(3000);
            blogStars = readBlogStars(driver);
        } finally {
            // quit() ends the whole browser session (and the chromedriver process), even when
            // scraping failed; close() would only close the current window.
            driver.quit();
        }

        System.out.println("Creating excel");
        if (writeWorkbook(blogStars)) {
            System.out.println("Done");
        }
    }

    /** Reads every list item under {@code #blogList} into a {@code BlogStar}. */
    private static ArrayList<BlogStar> readBlogStars(ChromeDriver driver) {
        ArrayList<BlogStar> result = new ArrayList<BlogStar>();
        List<WebElement> items = driver.findElements(By.xpath("//*[@id=\"blogList\"]/li"));
        for (WebElement item : items) {
            BlogStar blogStar = new BlogStar();

            // Time at which this entry was recorded.
            blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
            // Ranking number shown on the page.
            blogStar.num = item.findElement(By.className("num")).getText();
            // Display name of the blogger.
            blogStar.name = item.findElement(By.className("name")).getText();
            // URL of the avatar image.
            blogStar.avatarUrl = item.findElement(By.tagName("img")).getAttribute("src");
            // Years of membership ("code age").
            blogStar.intlevel = StringUtil.getInts(item.findElement(By.className("level")).getText())[0];
            // The blog-num text starts with the year, so index 1 is used for the article count
            // (presumably the second integer found by getInts; confirm against StringUtil).
            blogStar.intBlogNum = StringUtil.getInts(item.findElement(By.className("blog-num")).getText())[1];
            // Current number of votes.
            blogStar.intCurrentVote = StringUtil.getInts(item.findElement(By.className("current-vote")).getText())[0];

            result.add(blogStar);
        }
        return result;
    }

    /**
     * Builds the workbook (header row plus one row per entry) and writes it to disk.
     *
     * @param rows entries to export
     * @return {@code true} if the file was written, {@code false} if an I/O error occurred
     */
    private static boolean writeWorkbook(List<BlogStar> rows) {
        String target = outPutPath + filename + suffix;
        // try-with-resources releases the workbook and the stream even when writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(sheetname);

            // Column widths are given in 1/256ths of a character: narrow ranking column,
            // wide timestamp column, medium width for everything else.
            for (int col = 0; col < HEADS.length; col++) {
                int chars;
                if (col == 0) {
                    chars = 6;
                } else if (col == 6) {
                    chars = 20;
                } else {
                    chars = 15;
                }
                sheet.setColumnWidth(col, chars * 256);
            }

            // Header row.
            Row header = sheet.createRow(0);
            for (int col = 0; col < HEADS.length; col++) {
                header.createCell(col).setCellValue(HEADS[col]);
            }

            // Data rows start right below the header.
            int rowNum = 1;
            for (BlogStar blogStar : rows) {
                Row row = sheet.createRow(rowNum++);
                row.createCell(0).setCellValue(blogStar.num);
                row.createCell(1).setCellValue(blogStar.name);
                row.createCell(2).setCellValue(blogStar.avatarUrl);
                row.createCell(3).setCellValue(blogStar.intlevel);
                row.createCell(4).setCellValue(blogStar.intBlogNum);
                row.createCell(5).setCellValue(blogStar.intCurrentVote);
                row.createCell(6).setCellValue(blogStar.createTime);
            }

            // Open the file only once the workbook is complete, so a failure while building it
            // does not leave an empty file behind.
            try (FileOutputStream out = new FileOutputStream(target)) {
                workbook.write(out);
            }
            return true;
        } catch (IOException e) {
            // Also covers FileNotFoundException (e.g. the output directory does not exist).
            System.err.println("Failed to write " + target);
            e.printStackTrace();
            return false;
        }
    }

}

投票贡献排行榜数据采集

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: BlogStarStatisticsVoteLeaderboardList
 * Author: wangyetao
 * Date: 21-12-27 02:43:32
 * Description: 投票贡献排行榜
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 */
package simple.call.blogstar;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Collects the vote-contribution leaderboard of one blog-star candidate and exports it to Excel.
 *
 * <p>The program opens {@code url} (a candidate's detail page) in Chrome through Selenium, reads
 * every {@code <li>} under the element with id {@code voteLeaderboardList}, and writes one
 * spreadsheet row per supporter to {@code outPutPath + filename}.
 *
 * <p>Markup of a single entry as it appeared on the page when this sample was written:
 * <pre>{@code
 * <ul id="voteLeaderboardList">
 *   <li class="best-blogger-wrapper">
 *     <div class="left">
 *       <span class="num">1</span>
 *       <span class="text">swagLi</span>
 *       <span class="icon-level icon-level-3"></span>
 *       <span class="best-blogger"></span>
 *     </div>
 *     <div class="right">
 *       <span class="code-age">码龄4年</span>
 *       <span class="vote-num">36票</span>
 *     </div>
 *   </li>
 * </ul>
 * }</pre>
 *
 * @author wangyetao
 */
public class BlogStarStatisticsVoteLeaderboardList {

    /** Directory the workbook is written into; {@link FileOutputStream} does not create it. */
    private static final String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Output file name, including the extension. */
    private static final String filename = "aa518189.xlsx";
    /** Detail page of the candidate whose supporters are collected. */
    private static final String url = "https://bss.csdn.net/m/topic/blog_star2020/detail?username=aa518189";
    /** Column titles, in the same order the cells are written in {@link #writeWorkbook(List)}. */
    private static final String[] HEADS = {"编号", "博粉名称", "码龄(年)", "支持票数", "录入时间"};

    /** Entries scraped by the most recent run of {@link #main(String[])}. */
    private static ArrayList<BlogStar> blogVotes;

    /**
     * Scrapes the leaderboard and writes it to the Excel file.
     *
     * @param args ignored
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Selenium needs to know where the chromedriver binary lives before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            // Give the page a moment to finish rendering the list.
            Thread.sleep(2000);
            blogVotes = readVotes(driver);
        } finally {
            // quit() ends the whole browser session (and the chromedriver process), even when
            // scraping failed; close() would only close the current window.
            driver.quit();
        }

        System.out.println("Creating excel");
        if (writeWorkbook(blogVotes)) {
            System.out.println("Done");
        }
    }

    /** Reads every list item under {@code #voteLeaderboardList} into a {@code BlogStar}. */
    private static ArrayList<BlogStar> readVotes(ChromeDriver driver) {
        ArrayList<BlogStar> result = new ArrayList<BlogStar>();
        List<WebElement> items = driver.findElements(By.xpath("//*[@id=\"voteLeaderboardList\"]/li"));
        for (WebElement item : items) {
            BlogStar supporter = new BlogStar();

            // Time at which this entry was recorded.
            supporter.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
            // Position on the leaderboard.
            supporter.num = item.findElement(By.className("num")).getText();
            // Display name of the supporter.
            supporter.name = item.findElement(By.className("text")).getText();
            // Years of membership ("code age").
            supporter.intlevel = StringUtil.getInts(item.findElement(By.className("code-age")).getText())[0];
            // Number of votes this supporter contributed.
            supporter.intCurrentVote = StringUtil.getInts(item.findElement(By.className("vote-num")).getText())[0];

            result.add(supporter);
        }
        return result;
    }

    /**
     * Builds the workbook (header row plus one row per supporter) and writes it to disk.
     *
     * @param rows entries to export
     * @return {@code true} if the file was written, {@code false} if an I/O error occurred
     */
    private static boolean writeWorkbook(List<BlogStar> rows) {
        String target = outPutPath + filename;
        // try-with-resources releases the workbook and the stream even when writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet("BLOGVOTES");

            // Column widths are given in 1/256ths of a character: narrow ranking column,
            // wide timestamp column, medium width for everything else.
            for (int col = 0; col < HEADS.length; col++) {
                int chars;
                if (col == 0) {
                    chars = 6;
                } else if (col == 4) {
                    chars = 20;
                } else {
                    chars = 15;
                }
                sheet.setColumnWidth(col, chars * 256);
            }

            // Header row.
            Row header = sheet.createRow(0);
            for (int col = 0; col < HEADS.length; col++) {
                header.createCell(col).setCellValue(HEADS[col]);
            }

            // Data rows start right below the header.
            int rowNum = 1;
            for (BlogStar supporter : rows) {
                Row row = sheet.createRow(rowNum++);
                row.createCell(0).setCellValue(supporter.num);
                row.createCell(1).setCellValue(supporter.name);
                row.createCell(2).setCellValue(supporter.intlevel);
                row.createCell(3).setCellValue(supporter.intCurrentVote);
                row.createCell(4).setCellValue(supporter.createTime);
            }

            // Open the file only once the workbook is complete, so a failure while building it
            // does not leave an empty file behind.
            try (FileOutputStream out = new FileOutputStream(target)) {
                workbook.write(out);
            }
            return true;
        } catch (IOException e) {
            // Also covers FileNotFoundException (e.g. the output directory does not exist).
            System.err.println("Failed to write " + target);
            e.printStackTrace();
            return false;
        }
    }
}