init commit

This commit is contained in:
Jan 2024-12-14 22:48:23 +01:00
commit c6f415f765
15 changed files with 97148 additions and 0 deletions

38
.gitignore vendored Normal file
View file

@ -0,0 +1,38 @@
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### IntelliJ IDEA ###
.idea/modules.xml
.idea/jarRepositories.xml
.idea/compiler.xml
.idea/libraries/
*.iws
*.iml
*.ipr
### Eclipse ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
### Mac OS ###
.DS_Store

3
.idea/.gitignore generated vendored Normal file
View file

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

7
.idea/encodings.xml generated Normal file
View file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>

14
.idea/misc.xml generated Normal file
View file

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_23" default="true" project-jdk-name="23" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

48286
files.json Normal file

File diff suppressed because it is too large Load diff

48286
files_backup.json Normal file

File diff suppressed because it is too large Load diff

47
pom.xml Normal file
View file

@ -0,0 +1,47 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the TaricScraper project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>de.avatic.taricdb</groupId>
<artifactId>TaricScraper</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- Sources are compiled for and target Java 23; source files are read as UTF-8. -->
<maven.compiler.source>23</maven.compiler.source>
<maven.compiler.target>23</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Browser automation (WebDriver / ChromeDriver) used by the scraper. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.27.0</version>
</dependency>
<!-- Logging implementation and API. Keep both artifacts on the same version. -->
<!-- NOTE(review): 2.3.2 appears to be a very old Log4j release line for a Java 23 project; confirm this version is intended. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.3.2</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.3.2</version>
</dependency>
<!-- JSON (de)serialization of the scraped file tree. -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.11.0</version>
</dependency>
<!-- HTTP client used by the downloader. -->
<!-- NOTE(review): 2.2.0 is an old release; confirm whether a newer version should be used. -->
<dependency>
<groupId>org.asynchttpclient</groupId>
<artifactId>async-http-client</artifactId>
<version>2.2.0</version>
</dependency>
</dependencies>
</project>

View file

@ -0,0 +1,4 @@
package de.avatic.taricdb.scraper;
/**
 * Empty placeholder class; it currently declares no fields or methods.
 */
public class Diff {
}

View file

@ -0,0 +1,100 @@
package de.avatic.taricdb.scraper;
import de.avatic.taricdb.scraper.model.File;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.asynchttpclient.*;
import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
/**
 * Downloads every non-folder entry of the file tree restored by {@link ModelHandler}
 * into {@link #DOWNLOAD_FOLDER}, one file after the other.
 *
 * <p>Each download is awaited before the next one starts, so the target file is
 * completely written and closed before its modification time is set.
 */
public class Downloader {

    private static final Logger log = LogManager.getLogger("Downloader");

    /** Local directory prefix under which the remote folder structure is recreated. */
    private static final String DOWNLOAD_FOLDER = "C:/down2/";

    private File root;

    /** 1-based index of the file currently being downloaded (used for progress logging). */
    int cur = 0;

    /** Total number of files scheduled for download. */
    int max = 0;

    AsyncHttpClient client;

    /**
     * Entry point: restores the file tree and downloads all files it contains.
     *
     * @param args ignored
     * @throws IOException if the file tree cannot be restored or the HTTP client cannot be closed
     * @throws URISyntaxException declared for interface compatibility
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException {
        Downloader downloader = new Downloader();

        log.info("Downloader started: " + LocalDateTime.now());
        downloader.start();
        log.info("Downloader finished: " + LocalDateTime.now());
    }

    /** Creates a downloader backed by a default-configured {@link AsyncHttpClient}. */
    public Downloader() {
        client = Dsl.asyncHttpClient();
    }

    /**
     * Restores the tree, collects all files and downloads them sequentially,
     * pausing one second between requests.
     *
     * <p>A failed download is logged and skipped so the remaining files are still fetched.
     * The HTTP client is always closed when the run ends.
     */
    private void start() throws IOException, InterruptedException {
        try {
            root = new ModelHandler().restore();
            List<File> files = findFiles(root);

            max = files.size();
            int failed = 0;

            for (File f : files) {
                cur++;
                try {
                    download(f);
                } catch (IOException e) {
                    failed++;
                    log.error("Download failed for '" + f.getPath("/") + "'", e);
                }
                java.lang.Thread.sleep(1000);
            }

            if (failed > 0) {
                log.warn(failed + " of " + max + " downloads failed");
            }
        } finally {
            // Release the client's threads and connections.
            client.close();
        }
    }

    /**
     * Downloads a single file and blocks until the response body has been fully written.
     *
     * @param f the file to download
     * @throws IOException if the target cannot be opened/written or the request fails
     * @throws InterruptedException if the thread is interrupted while waiting for the response
     */
    private void download(File f) throws IOException, InterruptedException {
        java.io.File target = new java.io.File(DOWNLOAD_FOLDER + f.getPath("/"));
        java.io.File parentDir = target.getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }

        log.info("Downloading '"+target.getAbsolutePath()+"' ("+cur+" of "+max+") from " + f.getDownloadLink());

        try (FileOutputStream stream = new FileOutputStream(target)) {
            client.prepareGet(f.getDownloadLink()).execute(new AsyncCompletionHandler<Void>() {

                @Override
                public State onBodyPartReceived(HttpResponseBodyPart bodyPart)
                        throws Exception {
                    // Stream each chunk straight to disk instead of buffering the body in memory.
                    var buffer = bodyPart.getBodyByteBuffer();
                    var channel = stream.getChannel();
                    while (buffer.hasRemaining()) {
                        channel.write(buffer);
                    }
                    return State.CONTINUE;
                }

                @Override
                public Void onCompleted(Response response)
                        throws Exception {
                    int status = response.getStatusCode();
                    if (status < 200 || status >= 300) {
                        log.warn("Unexpected HTTP status " + status + " for " + f.getDownloadLink());
                    }
                    return null;
                }
            }).get(); // wait for completion so the stream is not closed while chunks still arrive
        } catch (java.util.concurrent.ExecutionException e) {
            throw new IOException("Request failed for " + f.getDownloadLink(), e);
        }

        // Must happen after the stream is closed; writing to the file would reset the timestamp.
        target.setLastModified(f.getLastModified());
    }

    /**
     * Recursively collects all non-folder entries below the given node.
     *
     * @param files the folder node to search
     * @return all files found in this folder and its sub-folders, in traversal order
     */
    private List<File> findFiles(File files) {
        List<File> found = new ArrayList<>();

        for (File f : files.getChildren()) {
            if (f.isFolder()) {
                found.addAll(findFiles(f));
            } else {
                found.add(f);
            }
        }

        return found;
    }

    // full download

    // incremental download
}

View file

@ -0,0 +1,58 @@
package de.avatic.taricdb.scraper;
import com.google.gson.*;
import de.avatic.taricdb.scraper.model.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
/**
 * Persists the scraped {@link File} tree as JSON in {@code files.json} (relative to the
 * working directory) and restores it again.
 */
public class ModelHandler {

    /** Name of the JSON file the tree is written to and read from. */
    private static final String MODEL_FILE = "files.json";

    private final Gson gson;

    /**
     * Creates a handler whose Gson instance reads and writes {@link LocalDateTime}
     * values as ISO-8601 local date-time strings.
     */
    public ModelHandler() {
        gson = new GsonBuilder()
                .registerTypeAdapter(LocalDateTime.class, (JsonSerializer<LocalDateTime>) (obj, type, jsonSerializationContext) -> new JsonPrimitive(obj.format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)))
                .registerTypeAdapter(LocalDateTime.class, (JsonDeserializer<LocalDateTime>) (json, type, jsonDeserializationContext) -> LocalDateTime.parse(json.getAsJsonPrimitive().getAsString(), DateTimeFormatter.ISO_LOCAL_DATE_TIME))
                .create();
    }

    /**
     * Serializes the given tree to the model file, replacing any previous content.
     *
     * @param files root of the tree to store
     * @throws IOException if the file cannot be written
     */
    public void save(File files) throws IOException {
        toFile(gson.toJson(files));
    }

    /**
     * Reads the tree from the model file and re-links every node to its parent.
     *
     * @return the root of the restored tree
     * @throws IOException if the file cannot be read or contains no JSON document
     */
    public File restore() throws IOException {
        File root = gson.fromJson(fromFile(), File.class);
        if (root == null) {
            // Gson yields null for an empty document; fail with a clear message instead of an NPE.
            throw new IOException("'" + MODEL_FILE + "' does not contain a file tree");
        }
        return restoreParents(root);
    }

    /** Sets the parent reference on all descendants of {@code fs}; the parent link is not part of the JSON. */
    private File restoreParents(File fs) {
        for (File f : fs.getChildren()) {
            f.setParent(fs);
            restoreParents(f);
        }
        return fs;
    }

    /** Reads the whole model file as UTF-8 text. */
    private String fromFile() throws IOException {
        try (FileInputStream inputStream = new FileInputStream(MODEL_FILE)) {
            return new String(inputStream.readAllBytes(), StandardCharsets.UTF_8);
        }
    }

    /** Writes {@code data} to the model file as UTF-8, matching the charset used for reading. */
    private void toFile(String data) throws IOException {
        try (FileOutputStream outputStream = new FileOutputStream(MODEL_FILE)) {
            outputStream.write(data.getBytes(StandardCharsets.UTF_8));
        }
    }
}

View file

@ -0,0 +1,141 @@
package de.avatic.taricdb.scraper;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import de.avatic.taricdb.scraper.model.File;
import de.avatic.taricdb.scraper.model.FileId;
import de.avatic.taricdb.scraper.model.FileIdType;
/**
 * Walks the remote library with a Chrome WebDriver, starting at a root folder, and builds
 * a {@link File} tree of all folders and files found. The tree is stored through
 * {@link ModelHandler} when the scan ends.
 */
public class Scraper {

    public static final String PATTERN = ".xlsx";

    private static final Logger log = LogManager.getLogger("Scraper");

    /** XPath of the links holding the name (and target URL) of each listed entry. */
    private static final String FILE_NAME_XPATH = "//div[contains(@class, 'file-name')]/a";

    /** XPath of the elements holding the last-change date of each listed entry. */
    private static final String LAST_CHANGE_XPATH = "//span[contains(@class, 'date')]";

    private final WebDriver driver;
    private FileId rootId;

    private File root;

    /**
     * Creates a scraper and opens a new Chrome browser session.
     *
     * @param rootId id of the folder at which scanning starts
     */
    public Scraper(FileId rootId) {
        this.driver = new ChromeDriver();
        this.rootId = rootId;
    }

    /**
     * Entry point: scans the whole library starting at {@link FileId#ROOT_FILE_ID}.
     *
     * @param args ignored
     * @throws IOException if the result cannot be saved
     * @throws URISyntaxException if a scraped link is not a valid URI
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        Scraper scraper = new Scraper(new FileId(FileId.ROOT_FILE_ID, FileIdType.FOLDER));

        log.info("Scraper started: " + LocalDateTime.now());
        scraper.start();
        log.info("Scraper finished: " + LocalDateTime.now());
    }

    /**
     * Runs the scan and saves the resulting tree. An {@link IOException} during scanning is
     * logged and whatever was collected so far is still saved. The browser is always shut down.
     */
    private void start() throws IOException, URISyntaxException {
        root = new File(FileId.ROOT_FILE_NAME, LocalDateTime.now(), this.rootId, null);

        try {
            try {
                scanPage(root);
            } catch (IOException e) {
                // Pass the exception itself so the logger prints message and stack trace.
                log.fatal("Scan aborted, saving what was collected so far", e);
            }

            new ModelHandler().save(root);
        } finally {
            // quit() closes every window and ends the session, even when scanning or saving failed.
            driver.quit();
        }
    }

    private void resume() {
        // read json file.
        // find dead ends
        // scanPage for all dead ends
        // save json file
    }

    /**
     * Loads all listing pages of {@code parent}, adds one child per listed row and then
     * descends into every child that is a folder.
     *
     * @param parent folder whose content is scanned
     * @throws MalformedURLException if a scraped link cannot be turned into a URL
     * @throws URISyntaxException if a scraped link is not a valid URI
     */
    private void scanPage(File parent) throws MalformedURLException, URISyntaxException {
        int pageIndex = 1;
        boolean hasPage = true;

        while (hasPage) {
            int curRow = 0;

            String page = parent.getPage(pageIndex);
            log.info("Open folder '" + parent.getPath(" > ") + "' page " + pageIndex + " (url=" + page + ")");
            driver.get(page);

            List<WebElement> fileNames = find(FILE_NAME_XPATH);
            List<WebElement> lastChange = find(LAST_CHANGE_XPATH);

            int rowsFound = fileNames.size();

            // Only the first page is retried: an empty later page simply means the listing ended.
            if ((0 == rowsFound || lastChange.size() != rowsFound) && 1 == pageIndex) {
                int retries = 3;
                while (0 != retries--) {
                    log.warn("Empty page. Retry ... ");
                    driver.get(page);
                    driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(10));

                    fileNames = find(FILE_NAME_XPATH);
                    lastChange = find(LAST_CHANGE_XPATH);

                    rowsFound = fileNames.size();

                    if (rowsFound > 0 && lastChange.size() == rowsFound)
                        break;
                }
            }

            if (lastChange.size() < rowsFound) {
                // Rows and dates are paired by index; with fewer dates the pairing below would
                // fail with an IndexOutOfBoundsException, so give up on this folder's listing.
                log.error("Found " + rowsFound + " names but only " + lastChange.size() + " dates in '"
                        + parent.getPath(" > ") + "' page " + pageIndex + ". Skipping the rest of this folder.");
                break;
            }

            for (WebElement fileName : fileNames) {
                File file = parent.createChildFromWebElement(fileName, lastChange.get(curRow));
                log.info("Found " + (file.isFolder() ? "folder" : "file") + " '" + file.getName() + "' (" + ++curRow + " of " + rowsFound + ") in " + parent.getName());
            }

            pageIndex++;

            // A page that is not full is the last one.
            if (rowsFound < FileId.FILES_PER_PAGE)
                hasPage = false;
        }

        for (File file : parent.getChildren()) {
            if (file.isFolder())
                scanPage(file);
        }
    }

    /**
     * Looks up all elements matching {@code xpath} on the current page, retrying up to five
     * times when the driver reports a stale element.
     *
     * @param xpath XPath expression to evaluate
     * @return the matching elements; an empty list (never {@code null}) if nothing was found
     */
    private List<WebElement> find(String xpath) {
        int retries = 5;

        while (0 != retries--) {
            try {
                return driver.findElements(By.xpath(xpath));
            } catch (NoSuchElementException e) {
                return List.of();
            } catch (StaleElementReferenceException e) {
                log.info("Wait for page to load ... (" + retries + "/5)");
                // NOTE(review): implicitlyWait configures the driver's element-lookup timeout;
                // it does not pause here. Confirm whether an actual delay was intended.
                driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(1));
            }
        }

        return List.of();
    }
}

View file

@ -0,0 +1,88 @@
package de.avatic.taricdb.scraper.model;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import org.openqa.selenium.WebElement;
/**
 * A node of the scraped library tree: either a folder (which may have children) or a file.
 */
public class File {

    /** Format of the last-change date text parsed in {@link #fromWebElement}; built once and reused. */
    private static final DateTimeFormatter LAST_CHANGED_FORMAT = DateTimeFormatter.ofPattern("yyyy MM dd, HH:mm");

    private final FileId id;

    private final String name;

    /** Link to the containing folder; {@code transient}, so it has to be set again via {@link #setParent}. */
    private transient File parent;

    private final List<File> children = new ArrayList<>();

    private final LocalDateTime lastChanged;

    /**
     * @param name display name of the entry
     * @param lastChanged time of the last change
     * @param fileId id of the entry
     * @param parent containing folder, or {@code null} for the root
     */
    public File(String name, LocalDateTime lastChanged, FileId fileId, File parent) {
        this.name = name;
        this.parent = parent;
        this.id = fileId;
        this.lastChanged = lastChanged;
    }

    /** Appends {@code file} to this node's children. */
    public void addChild(File file) {
        this.children.add(file);
    }

    public String getName() {
        return name;
    }

    /**
     * Builds the path from the root down to this node.
     *
     * @param delimiter text placed between two path segments
     * @return the names of all ancestors and of this node, joined by {@code delimiter}
     */
    public String getPath(String delimiter) {
        if (null != parent)
            return parent.getPath(delimiter) + delimiter + this.name;

        return this.name;
    }

    /** Returns the live list of children (not a copy). */
    public List<File> getChildren() {
        return children;
    }

    /** Returns the URL of listing page {@code index} of this node. */
    public String getPage(int index) {
        return id.toPageLink(index);
    }

    public boolean isFolder() {
        return id.isFolder();
    }

    /**
     * Creates a node from the given page elements and adds it as a child of this node.
     *
     * @param name element holding the entry's name and link
     * @param lastChanged element holding the entry's last-change date
     * @return the newly created child
     * @throws MalformedURLException if the link cannot be turned into a URL
     * @throws URISyntaxException if the link is not a valid URI
     */
    public File createChildFromWebElement(WebElement name, WebElement lastChanged) throws MalformedURLException, URISyntaxException {
        File file = File.fromWebElement(name, lastChanged, this);
        this.addChild(file);
        return file;
    }

    /**
     * Creates a node from the given page elements without attaching it to {@code parent}'s children.
     *
     * @param name element whose text is the entry's name and whose {@code href} identifies it
     * @param lastChanged element whose text is the last-change date in {@code yyyy MM dd, HH:mm} format
     * @param parent node to record as the new node's parent
     * @return the new node
     * @throws MalformedURLException if the link cannot be turned into a URL
     * @throws URISyntaxException if the link is not a valid URI
     */
    public static File fromWebElement(WebElement name, WebElement lastChanged, File parent) throws MalformedURLException, URISyntaxException {
        LocalDateTime changedAt = LocalDateTime.parse(lastChanged.getText(), LAST_CHANGED_FORMAT);
        FileId fileId = FileId.fromUrl(name.getDomAttribute("href"));
        return new File(name.getText(), changedAt, fileId, parent);
    }

    @Override
    public String toString() {
        return "File{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", children=" + children.size() +
                ", lastChanged=" + lastChanged +
                '}';
    }

    public void setParent(File parent) {
        this.parent = parent;
    }

    /** Returns the URL from which this entry can be downloaded. */
    public String getDownloadLink() {
        return id.toDownloadLink();
    }

    /**
     * Returns the last-change time as epoch milliseconds, interpreting the stored
     * local date-time in the system default time zone.
     */
    public long getLastModified() {
        return lastChanged.atZone(ZoneId.systemDefault()).toInstant().toEpochMilli();
    }
}

View file

@ -0,0 +1,58 @@
package de.avatic.taricdb.scraper.model;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List;
/**
 * Identifier of a library entry together with its kind (file or folder), plus helpers
 * that turn the identifier into listing and download URLs.
 */
public class FileId {

    /** Number of entries requested per listing page. */
    public static final Integer FILES_PER_PAGE = 100;

    private static final String BASE_URL = "https://circabc.europa.eu";

    public static final String GROUP_ID = "0e5f18c2-4b2f-42e9-aed4-dfe50ae1263b";
    public static final String ROOT_FILE_ID = "90dd4b94-39dd-45f4-8f33-ec9ef9820016";
    public static final String ROOT_FILE_NAME = "TARIC and Quota data and information";

    private static final String LIBRARY_URL = BASE_URL + "/ui/group/" + GROUP_ID + "/library";
    private static final String DOWNLOAD_URL = BASE_URL + "/rest/download";

    private final String id;
    private final FileIdType type;

    /** Creates a folder id. */
    private FileId(String id) {
        this.id = id;
        this.type = FileIdType.FOLDER;
    }

    /**
     * @param id the raw identifier
     * @param type whether the identifier denotes a file or a folder
     */
    public FileId(String id, FileIdType type) {
        this.id = id;
        this.type = type;
    }

    /**
     * Builds the URL of one listing page of this entry, sorted by name ascending.
     *
     * @param page page number to request
     * @return the listing URL
     */
    public String toPageLink(int page) {
        StringBuilder link = new StringBuilder(LIBRARY_URL);
        link.append("/").append(id);
        link.append("?p=").append(page);
        link.append("&n=").append(FILES_PER_PAGE);
        link.append("&sort=name_ASC");
        return link.toString();
    }

    /** Builds the download URL of this entry. */
    public String toDownloadLink() {
        return DOWNLOAD_URL + "/" + id;
    }

    /**
     * Derives an id from a link, resolved against the base URL. When the last path segment
     * is {@code details}, the segment before it is used as a file id; otherwise the last
     * segment is used as a folder id.
     *
     * @param urlString absolute or relative link
     * @return the id extracted from the link's path
     * @throws MalformedURLException if the resolved URI cannot be converted to a URL
     * @throws URISyntaxException if the base URL is not a valid URI
     */
    public static FileId fromUrl(String urlString) throws MalformedURLException, URISyntaxException {
        String path = new URI(BASE_URL).resolve(urlString).toURL().getPath();
        List<String> segments = Arrays.asList(path.split("/"));

        String lastSegment = segments.getLast();
        if (!lastSegment.equals("details")) {
            return new FileId(lastSegment);
        }

        int idIndex = segments.size() - 2;
        return new FileId(segments.get(idIndex), FileIdType.FILE);
    }

    @Override
    public String toString() {
        return id;
    }

    /** Returns {@code true} if this id denotes a folder. */
    public boolean isFolder() {
        boolean folder = type.equals(FileIdType.FOLDER);
        return folder;
    }
}

View file

@ -0,0 +1,5 @@
package de.avatic.taricdb.scraper.model;
/**
 * Kind of entry a {@code FileId} refers to.
 */
public enum FileIdType {
    /** A downloadable file. */
    FILE,

    /** A folder. */
    FOLDER
}

View file

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="WARN">
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="DEBUG">
<AppenderRef ref="Console"/>
</Root>
</Loggers>
</Configuration>